invinciblejha01 bwshen-mi committed on
Commit
8715df4
·
0 Parent(s):

Duplicate from XiaomiMiMo/MiMo-V2.5-Pro

Browse files

Co-authored-by: Bowen Shen <bwshen-mi@users.noreply.huggingface.co>

Files changed (49) hide show
  1. .gitattributes +40 -0
  2. README.md +218 -0
  3. added_tokens.json +28 -0
  4. assets/architecture.png +3 -0
  5. assets/benchmark.jpg +3 -0
  6. assets/post_training_evaluation.jpg +3 -0
  7. config.json +156 -0
  8. configuration_mimo_v2.py +209 -0
  9. merges.txt +0 -0
  10. model.safetensors.index.json +3 -0
  11. model_mtp.safetensors +3 -0
  12. model_pp0_ep0_shard0.safetensors +3 -0
  13. model_pp0_ep0_shard1.safetensors +3 -0
  14. model_pp0_ep10_shard0.safetensors +3 -0
  15. model_pp0_ep11_shard0.safetensors +3 -0
  16. model_pp0_ep12_shard0.safetensors +3 -0
  17. model_pp0_ep13_shard0.safetensors +3 -0
  18. model_pp0_ep14_shard0.safetensors +3 -0
  19. model_pp0_ep15_shard0.safetensors +3 -0
  20. model_pp0_ep16_shard0.safetensors +3 -0
  21. model_pp0_ep17_shard0.safetensors +3 -0
  22. model_pp0_ep18_shard0.safetensors +3 -0
  23. model_pp0_ep19_shard0.safetensors +3 -0
  24. model_pp0_ep1_shard0.safetensors +3 -0
  25. model_pp0_ep20_shard0.safetensors +3 -0
  26. model_pp0_ep21_shard0.safetensors +3 -0
  27. model_pp0_ep22_shard0.safetensors +3 -0
  28. model_pp0_ep23_shard0.safetensors +3 -0
  29. model_pp0_ep24_shard0.safetensors +3 -0
  30. model_pp0_ep25_shard0.safetensors +3 -0
  31. model_pp0_ep26_shard0.safetensors +3 -0
  32. model_pp0_ep27_shard0.safetensors +3 -0
  33. model_pp0_ep28_shard0.safetensors +3 -0
  34. model_pp0_ep29_shard0.safetensors +3 -0
  35. model_pp0_ep2_shard0.safetensors +3 -0
  36. model_pp0_ep30_shard0.safetensors +3 -0
  37. model_pp0_ep31_shard0.safetensors +3 -0
  38. model_pp0_ep3_shard0.safetensors +3 -0
  39. model_pp0_ep4_shard0.safetensors +3 -0
  40. model_pp0_ep5_shard0.safetensors +3 -0
  41. model_pp0_ep6_shard0.safetensors +3 -0
  42. model_pp0_ep7_shard0.safetensors +3 -0
  43. model_pp0_ep8_shard0.safetensors +3 -0
  44. model_pp0_ep9_shard0.safetensors +3 -0
  45. modeling_mimo_v2.py +697 -0
  46. special_tokens_map.json +31 -0
  47. tokenizer.json +3 -0
  48. tokenizer_config.json +240 -0
  49. vocab.json +0 -0
.gitattributes ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/architecture.png filter=lfs diff=lfs merge=lfs -text
37
+ assets/benchmark.jpg filter=lfs diff=lfs merge=lfs -text
38
+ assets/post_training_evaluation.jpg filter=lfs diff=lfs merge=lfs -text
39
+ model.safetensors.index.json filter=lfs diff=lfs merge=lfs -text
40
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ - zh
6
+ tags:
7
+ - text-generation
8
+ - agent
9
+ - long-context
10
+ - code
11
+ ---
12
+
13
+ <br/><br/>
14
+
15
+ <div align="center">
16
+ <picture>
17
+ <source srcset="https://github.com/XiaomiMiMo/MiMo/raw/main/figures/Xiaomi_MiMo_darkmode.png?raw=true" media="(prefers-color-scheme: dark)">
18
+ <img src="https://github.com/XiaomiMiMo/MiMo/raw/main/figures/Xiaomi_MiMo.png?raw=true" width="60%" alt="Xiaomi-MiMo" />
19
+ </picture>
20
+ </div>
21
+
22
+ <br/>
23
+
24
+ <div align="center" style="line-height: 1;">
25
+ |
26
+ <a href="https://huggingface.co/XiaomiMiMo" target="_blank">🤗 HuggingFace</a>
27
+ &nbsp;|
28
+ <a href="https://mimo.xiaomi.com/mimo-v2-5-pro" target="_blank">📰 Blog </a>
29
+ &nbsp;|
30
+ <a href="https://platform.xiaomimimo.com/" target="_blank">🎨 Xiaomi MiMo API Platform </a>
31
+ &nbsp;|
32
+ <a href="https://aistudio.xiaomimimo.com" target="_blank">🗨️ Xiaomi MiMo Studio </a>
33
+ &nbsp;|
34
+ </div>
35
+
36
+ <br/>
37
+
38
+ <div align="center" style="line-height: 1.2;">
39
+ <strong>Community</strong><br/>
40
+ <a href="https://work.weixin.qq.com/apph5/external_room/join/group_mng?plg_id=c417f99bd9014b5dd894daa8bfe19790&" target="_blank">WeChat Group</a>
41
+ &nbsp;|&nbsp;
42
+ <a href="https://discord.gg/WX2R2uNp" target="_blank">Discord</a>
43
+ &nbsp;|&nbsp;
44
+ <a href="https://t.me/+3T-I0pekOVIyNDBl" target="_blank">Telegram</a>
45
+ &nbsp;|&nbsp;
46
+ <a href="https://www.reddit.com/r/XiaomiMiMo_Official/" target="_blank">Reddit</a>
47
+ </div>
48
+
49
+ <br/>
50
+
51
+ # MiMo-V2.5-Pro
52
+
53
+ MiMo-V2.5-Pro is an open-source Mixture-of-Experts (MoE) language model with 1.02T total parameters and 42B active parameters. It utilizes the hybrid attention architecture and 3-layer Multi-Token Prediction (MTP) introduced in [MiMo-V2-Flash](https://github.com/XiaomiMiMo/MiMo-V2-Flash), with a context length of up to 1M tokens.
54
+
55
+ <div align="center">
56
+ <img src="assets/benchmark.jpg" width="90%" alt="Benchmark Results" />
57
+ </div>
58
+
59
+ ## 1. Introduction
60
+
61
+ MiMo-V2.5-Pro is our most capable model to date, designed for the most demanding agentic, complex software engineering, and long-horizon tasks. It sustains complex trajectories spanning thousands of tool calls with strong instruction following and coherence over a 1M-token context window. Key features include:
62
+
63
+ - **Hybrid Attention Architecture**: Interleaves Sliding Window Attention (SWA) and Global Attention (GA) with a 6:1 ratio and 128 sliding window. This reduces KV-cache storage by nearly 7x while maintaining long-context performance via learnable attention sink bias.
64
+ - **Multi-Token Prediction (MTP)**: Equipped with three lightweight MTP modules using dense FFNs. This triples output speed during inference and can also accelerate rollout in RL training.
65
+ - **Efficient Pre-Training**: Trained on 27T tokens using FP8 mixed precision and native 32k seq length. The context window supports up to 1M tokens.
66
+ - **Agentic Capabilities**: Post-training utilizes SFT, large-scale agentic RL and Multi-Teacher On-Policy Distillation (MOPD), achieving superior performance on the most demanding agentic, complex software engineering, and long-horizon tasks.
67
+
68
+ ## 2. Model Downloads
69
+
70
+ | Model | Total Params | Active Params | Context Length | Precision | Download |
71
+ | :--- | :---: | :---: | :---: | :---: | :---: |
72
+ | **MiMo-V2.5-Pro** | 1.02T | 42B | 1M | FP8 (E4M3) Mixed | [🤗 HuggingFace](https://huggingface.co/XiaomiMiMo/MiMo-V2.5-Pro) <br> [🤖 ModelScope](https://modelscope.cn/models/XiaomiMiMo/MiMo-V2.5-Pro) |
73
+ | **MiMo-V2.5-Pro-Base** | 1.02T | 42B | 256K | FP8 (E4M3) Mixed | [🤗 HuggingFace](https://huggingface.co/XiaomiMiMo/MiMo-V2.5-Pro-Base) <br> [🤖 ModelScope](https://modelscope.cn/models/XiaomiMiMo/MiMo-V2.5-Pro-Base) |
74
+
75
+ ## 3. Evaluation Results
76
+
77
+ ### Base Model Evaluation
78
+
79
+ | Category | Benchmark | Setting | MiMo-V2.5-Pro Base | MiMo-V2.5 Base | DeepSeek-V4-Pro Base | DeepSeek-V4-Flash Base | Kimi-K2 Base |
80
+ | :--- | :--- | :---: | :---: | :---: | :---: | :---: | :---: |
81
+ | **Params** | #Activated / #Total | - | 42B / 1.02T | 15B / 310B | 49B / 1.6T | 13B / 284B | 32B / 1.04T |
82
+ | **General** | BBH | 3-shot | 88.4 | 87.2 | 87.5 | 86.9 | 88.7 |
83
+ | | MMLU | 5-shot | 89.4 | 86.3 | 90.1 | 88.7 | 87.8 |
84
+ | | MMLU-Redux | 5-shot | 92.8 | 89.8 | 90.8 | 89.4 | 90.2 |
85
+ | | MMLU-Pro | 5-shot | 68.5 | 65.8 | 73.5 | 68.3 | 69.2 |
86
+ | | DROP | 3-shot | 86.3 | 83.7 | 88.7 | 88.6 | 83.6 |
87
+ | | ARC-Challenge | 25-shot | 97.2 | 96.5 | - | - | 96.2 |
88
+ | | HellaSwag | 10-shot | 89.8 | 88.6 | 88.0 | 85.7 | 94.6 |
89
+ | | WinoGrande | 5-shot | 85.6 | 84.7 | 81.5 | 79.5 | 85.3 |
90
+ | | TriviaQA | 5-shot | 81.3 | 80.7 | 85.6 | 82.8 | 85.1 |
91
+ | | GPQA-Diamond | 5-shot | 66.7 | 58.1 | - | - | 48.1 |
92
+ | **Math** | GSM8K | 8-shot | 99.6 | 83.3 | 92.6 | 90.8 | 92.1 |
93
+ | | MATH | 4-shot | 86.2 | 67.7 | 64.5 | 57.4 | 70.2 |
94
+ | | AIME 24&25 | 2-shot | 37.3 | 36.9 | - | - | 31.6 |
95
+ | **Code** | HumanEval+ | 1-shot | 75.6 | 71.3 | - | - | 84.8 |
96
+ | | MBPP+ | 3-shot | 74.1 | 70.9 | - | - | 73.8 |
97
+ | | LiveCodeBench v6 | 1-shot | 39.6 | 35.5 | - | - | 26.3 |
98
+ | | SWE-Bench (AgentLess) | 3-shot | 35.7 | 30.8 | - | - | 28.2 |
99
+ | **Chinese** | C-Eval | 5-shot | 91.5 | 88.6 | 93.1 | 92.1 | 92.5 |
100
+ | | CMMLU | 5-shot | 90.2 | 88.2 | 90.8 | 90.4 | 90.9 |
101
+ | **Multilingual** | GlobalMMLU | 5-shot | 83.6 | 77.4 | - | - | 80.7 |
102
+
103
+ ### Long-context Evaluation
104
+
105
+ <div align="center">
106
+ <img src="assets/post_training_evaluation.jpg" width="80%" alt="Post-training Evaluation" />
107
+ </div>
108
+
109
+ GraphWalks is a long-context benchmark from OpenAI that fills the prompt with a directed graph of hex-hash nodes and asks the model to run a breadth-first search (nodes exactly at depth *N*) or list a node's parents. We evaluate across the full 32k–1M input-token span and apply the same evaluation fixes described by Anthropic.
110
+
111
+ MiMo V2.5 Pro delivers a major leap in long-context reasoning. Past 128k, V2 Pro degrades rapidly and collapses to 0.00 at 1M on both subtasks, while V2.5 Pro still scores 0.56 BFS / 0.92 Parents at 512k and 0.37 / 0.62 at 1M.
112
+
113
+ ## 4. Model Architecture & Training Process
114
+
115
+ MiMo-V2.5-Pro addresses the quadratic complexity of long contexts by interleaving Local Sliding Window Attention (SWA) and Global Attention (GA). Unlike traditional speculative decoding, our MTP module is natively integrated for training and inference.
116
+
117
+ <div align="center">
118
+ <img src="assets/architecture.png" width="60%" alt="Model Architecture" />
119
+ </div>
120
+
121
+ ### Model Summary
122
+
123
+ | Component | MiMo-V2.5-Pro | MiMo-V2.5 |
124
+ | :--- | :---: | :---: |
125
+ | **Total Parameters** | 1.02T | 310B |
126
+ | **Activated Parameters** | 42B | 15B |
127
+ | **Hidden Size** | 6144 | 4096 |
128
+ | **Num Layers** | 70 (1 dense + 69 MoE) | 48 (1 dense + 47 MoE)|
129
+ | **Full Attention Layers** | 10 | 9 |
130
+ | **SWA Layers** | 60 | 39 |
131
+ | **Num Attention Heads** | 128 | 64 |
132
+ | **Num KV Heads** | 8 (GQA) | 8 (GA) / 4 (SWA) |
133
+ | **Head Dim (QK / V)** | 192 / 128 | 192 / 128 |
134
+ | **Routed Experts** | 384 | 256 |
135
+ | **Experts per Token** | 8 | 8 |
136
+ | **MoE Intermediate Size** | 2048 | 2048 |
137
+ | **Dense Intermediate Size** | 16384 (layer 0 only) | 16384 (layer 0 only) |
138
+ | **SWA Window Size** | 128 | 128 |
139
+ | **Max Context Length** | 1M | 1M |
140
+ | **MTP Layers** | 3 | 3 |
141
+
142
+ ### Training Process
143
+
144
+ For post-training, MiMo-V2.5-Pro adopts the three-stage post-training paradigm introduced in [MiMo-V2-Flash](https://github.com/XiaomiMiMo/MiMo-V2-Flash) to achieve exceptional performance. The paradigm begins with Supervised Fine-Tuning (SFT) to build strong, foundational instruction-following skills using curated data pairs. Next, in the Domain-Specialized Training stage, diverse teacher models — ranging from math and safety to complex agentic tool-use — are individually optimized using domain-specific RL rewards. Finally, the process culminates in Multi-Teacher On-Policy Distillation (MOPD). Through dynamic on-policy RL, the single student model iteratively learns from its own outputs, continuously receiving precise token-level guidance from the expert teachers to seamlessly integrate broad capabilities.
145
+
146
+ ## 5. Deployment
147
+
148
+ Since inference engines are continuously updated and optimized, this guide only provides deployment examples for reference. For the best results, we strongly recommend following the referenced guides below for the latest best practices.
149
+
150
+ ### SGLang Deployment
151
+
152
+ For the best performance, we strongly recommend deploying using this approach, which is officially supported by the SGLang community. Please refer to [SGLang MiMo-V2.5-Pro Cookbook](https://docs.sglang.io/cookbook/autoregressive/Xiaomi/MiMo-V2.5) for the latest deployment guide.
153
+
154
+ The following is an example of running the model with SGLang, referenced from [sgl-project/sglang#23808](https://github.com/sgl-project/sglang/pull/23808):
155
+
156
+ ```bash
157
+ SGLANG_ENABLE_SPEC_V2=1
158
+ SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
159
+ python3 -m sglang.launch_server \
160
+ --model-path XiaomiMiMo/MiMo-V2.5-Pro \
161
+ --trust-remote-code \
162
+ --pp-size 1 \
163
+ --dp-size 2 \
164
+ --ep-size 16 \
165
+ --tp-size 16 \
166
+ --moe-dense-tp-size 1 \
167
+ --enable-dp-attention \
168
+ --moe-a2a-backend deepep \
169
+ --dist-init-addr ${LWS_LEADER_IP}:20000 \
170
+ --node-rank ${LWS_WORKER_INDEX} \
171
+ --nnodes ${LWS_GROUP_SIZE} \
172
+ --page-size 64 \
173
+ --attention-backend fa3 \
174
+ --quantization fp8 \
175
+ --mem-fraction-static 0.7 \
176
+ --max-running-requests 128 \
177
+ --cuda-graph-max-bs 64 \
178
+ --chunked-prefill-size 32768 \
179
+ --context-length 1048576 \
180
+ --tokenizer-worker-num 64 \
181
+ --speculative-algorithm EAGLE \
182
+ --speculative-num-steps 3 \
183
+ --speculative-eagle-topk 1 \
184
+ --speculative-num-draft-tokens 4 \
185
+ --enable-multi-layer-eagle \
186
+ --host 0.0.0.0 \
187
+ --port 9001 \
188
+ --reasoning-parser mimo \
189
+ --tool-call-parser mimo \
190
+ --watchdog-timeout 3600 \
191
+ --model-loader-extra-config '{"enable_multithread_load": "true","num_threads": 64}'
192
+ ```
193
+
194
+ ### vLLM Deployment
195
+
196
+ For the best performance, we strongly recommend deploying using this approach, which is officially supported by the vLLM community. Please refer to [vLLM MiMo-V2.5-Pro Cookbook](https://recipes.vllm.ai/XiaomiMiMo/MiMo-V2.5-Pro) for the latest deployment guide.
197
+
198
+ For local deployment, we recommend setting the sampling parameters to `temperature=1.0`, `top_p=0.95`.
199
+
200
+ ## Citation
201
+
202
+ ```bibtex
203
+ @misc{mimo2026v25pro,
204
+ title={MiMo-V2.5-Pro},
205
+ author={{Xiaomi MiMo Team}},
206
+ year={2026},
207
+ howpublished={\url{https://huggingface.co/collections/XiaomiMiMo/mimo-v25}},
208
+ }
209
+ ```
210
+
211
+ ## Contact
212
+
213
+ For questions or feedback, reach us at [mimo@xiaomi.com](mailto:mimo@xiaomi.com) or join our community:
214
+
215
+ - [WeChat Group](https://work.weixin.qq.com/apph5/external_room/join/group_mng?plg_id=c417f99bd9014b5dd894daa8bfe19790&)
216
+ - [Discord](https://discord.gg/WX2R2uNp)
217
+ - [Telegram](https://t.me/+3T-I0pekOVIyNDBl)
218
+ - [Reddit](https://www.reddit.com/r/XiaomiMiMo_Official/)
added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
assets/architecture.png ADDED

Git LFS Details

  • SHA256: 364418932cbfeb4757e7f7bc91bd3df19de6fca21640c3c0bbda89a569358e0d
  • Pointer size: 131 Bytes
  • Size of remote file: 152 kB
assets/benchmark.jpg ADDED

Git LFS Details

  • SHA256: 5b4e4a2c797c9ab879249f88b5b2341322391f71c689941a70e075b66401a6c6
  • Pointer size: 131 Bytes
  • Size of remote file: 624 kB
assets/post_training_evaluation.jpg ADDED

Git LFS Details

  • SHA256: f8b9ea21cb0b35905395f4af80aa6952fce1367b388af82d67ab89233e2674a4
  • Pointer size: 131 Bytes
  • Size of remote file: 216 kB
config.json ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MiMoV2ForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_mimo_v2.MiMoV2Config",
7
+ "AutoModel": "modeling_mimo_v2.MiMoV2Model",
8
+ "AutoModelForCausalLM": "modeling_mimo_v2.MiMoV2ForCausalLM"
9
+ },
10
+ "quantization_config": {
11
+ "activation_scheme": "dynamic",
12
+ "fmt": "e4m3",
13
+ "quant_method": "fp8",
14
+ "weight_block_size": [
15
+ 128,
16
+ 128
17
+ ],
18
+ "ignored_layers": [
19
+ "model.layers.0.self_attn.o_proj",
20
+ "model.layers.1.self_attn.o_proj",
21
+ "model.layers.2.self_attn.o_proj",
22
+ "model.layers.3.self_attn.o_proj",
23
+ "model.layers.4.self_attn.o_proj",
24
+ "model.layers.5.self_attn.o_proj",
25
+ "model.layers.6.self_attn.o_proj",
26
+ "model.layers.7.self_attn.o_proj",
27
+ "model.layers.8.self_attn.o_proj",
28
+ "model.layers.9.self_attn.o_proj",
29
+ "model.layers.10.self_attn.o_proj",
30
+ "model.layers.11.self_attn.o_proj",
31
+ "model.layers.12.self_attn.o_proj",
32
+ "model.layers.13.self_attn.o_proj",
33
+ "model.layers.14.self_attn.o_proj",
34
+ "model.layers.15.self_attn.o_proj",
35
+ "model.layers.16.self_attn.o_proj",
36
+ "model.layers.17.self_attn.o_proj",
37
+ "model.layers.18.self_attn.o_proj",
38
+ "model.layers.19.self_attn.o_proj",
39
+ "model.layers.20.self_attn.o_proj",
40
+ "model.layers.21.self_attn.o_proj",
41
+ "model.layers.22.self_attn.o_proj",
42
+ "model.layers.23.self_attn.o_proj",
43
+ "model.layers.24.self_attn.o_proj",
44
+ "model.layers.25.self_attn.o_proj",
45
+ "model.layers.26.self_attn.o_proj",
46
+ "model.layers.27.self_attn.o_proj",
47
+ "model.layers.28.self_attn.o_proj",
48
+ "model.layers.29.self_attn.o_proj",
49
+ "model.layers.30.self_attn.o_proj",
50
+ "model.layers.31.self_attn.o_proj",
51
+ "model.layers.32.self_attn.o_proj",
52
+ "model.layers.33.self_attn.o_proj",
53
+ "model.layers.34.self_attn.o_proj",
54
+ "model.layers.35.self_attn.o_proj",
55
+ "model.layers.36.self_attn.o_proj",
56
+ "model.layers.37.self_attn.o_proj",
57
+ "model.layers.38.self_attn.o_proj",
58
+ "model.layers.39.self_attn.o_proj",
59
+ "model.layers.40.self_attn.o_proj",
60
+ "model.layers.41.self_attn.o_proj",
61
+ "model.layers.42.self_attn.o_proj",
62
+ "model.layers.43.self_attn.o_proj",
63
+ "model.layers.44.self_attn.o_proj",
64
+ "model.layers.45.self_attn.o_proj",
65
+ "model.layers.46.self_attn.o_proj",
66
+ "model.layers.47.self_attn.o_proj",
67
+ "model.layers.48.self_attn.o_proj",
68
+ "model.layers.49.self_attn.o_proj",
69
+ "model.layers.50.self_attn.o_proj",
70
+ "model.layers.51.self_attn.o_proj",
71
+ "model.layers.52.self_attn.o_proj",
72
+ "model.layers.53.self_attn.o_proj",
73
+ "model.layers.54.self_attn.o_proj",
74
+ "model.layers.55.self_attn.o_proj",
75
+ "model.layers.56.self_attn.o_proj",
76
+ "model.layers.57.self_attn.o_proj",
77
+ "model.layers.58.self_attn.o_proj",
78
+ "model.layers.59.self_attn.o_proj",
79
+ "model.layers.60.self_attn.o_proj",
80
+ "model.layers.61.self_attn.o_proj",
81
+ "model.layers.62.self_attn.o_proj",
82
+ "model.layers.63.self_attn.o_proj",
83
+ "model.layers.64.self_attn.o_proj",
84
+ "model.layers.65.self_attn.o_proj",
85
+ "model.layers.66.self_attn.o_proj",
86
+ "model.layers.67.self_attn.o_proj",
87
+ "model.layers.68.self_attn.o_proj",
88
+ "model.layers.69.self_attn.o_proj",
89
+ "model.decoder.self_attn.o_proj"
90
+ ]
91
+ },
92
+ "add_full_attention_sink_bias": false,
93
+ "add_swa_attention_sink_bias": true,
94
+ "attention_bias": false,
95
+ "attention_chunk_size": 128,
96
+ "attention_dropout": 0.0,
97
+ "attention_projection_layout": "fused_qkv",
98
+ "attention_value_scale": 0.612,
99
+ "head_dim": 192,
100
+ "hidden_act": "silu",
101
+ "hidden_size": 6144,
102
+ "hybrid_layer_pattern": [
103
+ 0, 1, 1, 1, 1, 1, 1,
104
+ 0, 1, 1, 1, 1, 1, 1, 1,
105
+ 0, 1, 1, 1, 1, 1, 1, 1,
106
+ 0, 1, 1, 1, 1, 1, 1, 1,
107
+ 0, 1, 1, 1, 1, 1, 1, 1,
108
+ 0, 1, 1, 1, 1, 1, 1, 1,
109
+ 0, 1, 1, 1, 1, 1, 1, 1,
110
+ 0, 1, 1, 1, 1, 1, 1,
111
+ 0, 1, 1, 1, 1, 1, 1,
112
+ 0
113
+ ],
114
+ "initializer_range": 0.02,
115
+ "intermediate_size": 16384,
116
+ "layernorm_epsilon": 1e-05,
117
+ "max_position_embeddings": 1048576,
118
+ "model_type": "mimo_v2",
119
+ "moe_intermediate_size": 2048,
120
+ "moe_layer_freq": [
121
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
122
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
123
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
124
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
125
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
126
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
127
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
128
+ ],
129
+ "n_group": 1,
130
+ "n_routed_experts": 384,
131
+ "n_shared_experts": null,
132
+ "norm_topk_prob": true,
133
+ "num_attention_heads": 128,
134
+ "num_experts_per_tok": 8,
135
+ "num_hidden_layers": 70,
136
+ "num_key_value_heads": 8,
137
+ "partial_rotary_factor": 0.334,
138
+ "rope_theta": 10000000,
139
+ "routed_scaling_factor": null,
140
+ "scoring_func": "sigmoid",
141
+ "sliding_window": 128,
142
+ "sliding_window_size": 128,
143
+ "swa_head_dim": 192,
144
+ "swa_num_attention_heads": 128,
145
+ "swa_num_key_value_heads": 8,
146
+ "swa_rope_theta": 10000,
147
+ "swa_v_head_dim": 128,
148
+ "tie_word_embeddings": false,
149
+ "topk_group": 1,
150
+ "topk_method": "noaux_tc",
151
+ "torch_dtype": "bfloat16",
152
+ "transformers_version": "4.57.1",
153
+ "use_cache": true,
154
+ "v_head_dim": 128,
155
+ "vocab_size": 152576
156
+ }
configuration_mimo_v2.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ #
3
+ # Copyright 2026 Xiaomi Corporation.
4
+ # Copyright 2026 The HuggingFace Inc. team.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ from transformers.configuration_utils import PretrainedConfig
19
+ from transformers.modeling_rope_utils import rope_config_validation
20
+ from transformers.utils import logging
21
+
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
+
26
+ _MIMOV2_ATTENTION_PROJECTION_LAYOUTS = {"split", "fused_qkv"}
27
+
28
+ _MIMOV2_SPLIT_TP_PLAN = {
29
+ "layers.*.self_attn.q_proj": "colwise",
30
+ "layers.*.self_attn.k_proj": "colwise",
31
+ "layers.*.self_attn.v_proj": "colwise",
32
+ "layers.*.self_attn.o_proj": "rowwise",
33
+ "layers.*.mlp.gate_proj": "colwise",
34
+ "layers.*.mlp.up_proj": "colwise",
35
+ "layers.*.mlp.down_proj": "rowwise",
36
+ }
37
+
38
+ _MIMOV2_FUSED_QKV_TP_PLAN = {
39
+ "layers.*.self_attn.qkv_proj": "colwise",
40
+ "layers.*.self_attn.o_proj": "rowwise",
41
+ "layers.*.mlp.gate_proj": "colwise",
42
+ "layers.*.mlp.up_proj": "colwise",
43
+ "layers.*.mlp.down_proj": "rowwise",
44
+ }
45
+
46
+ _MIMOV2_PP_PLAN = {
47
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
48
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
49
+ "norm": (["hidden_states"], ["hidden_states"]),
50
+ }
51
+
52
+
53
+ class MiMoV2Config(PretrainedConfig):
54
+
55
+ model_type = "mimo_v2"
56
+ keys_to_ignore_at_inference = ["past_key_values"]
57
+
58
+ base_model_tp_plan = _MIMOV2_SPLIT_TP_PLAN
59
+ base_model_pp_plan = _MIMOV2_PP_PLAN
60
+
61
+ attribute_map = {
62
+ "num_local_experts": "n_routed_experts",
63
+ }
64
+
65
+ def __init__(
66
+ self,
67
+ vocab_size=151936,
68
+ hidden_size=4096,
69
+ intermediate_size=22016,
70
+ num_hidden_layers=32,
71
+ num_attention_heads=32,
72
+ num_key_value_heads=32,
73
+ hidden_act="silu",
74
+ max_position_embeddings=32768,
75
+ initializer_range=0.02,
76
+ layernorm_epsilon=1e-6,
77
+ use_cache=True,
78
+ tie_word_embeddings=False,
79
+ rope_theta=10000.0,
80
+ rope_scaling=None,
81
+ attention_dropout=0.0,
82
+ attention_bias=False,
83
+ attention_value_scale=None,
84
+ head_dim=None,
85
+ v_head_dim=None,
86
+ swa_num_attention_heads=None,
87
+ swa_num_key_value_heads=None,
88
+ swa_head_dim=None,
89
+ swa_v_head_dim=None,
90
+ swa_rope_theta=None,
91
+ sliding_window=None,
92
+ sliding_window_size=None,
93
+ add_full_attention_sink_bias=False,
94
+ add_swa_attention_sink_bias=False,
95
+ hybrid_block_size=None,
96
+ hybrid_layer_pattern=None,
97
+ partial_rotary_factor=1.0,
98
+ n_routed_experts=None,
99
+ moe_intermediate_size=None,
100
+ num_experts_per_tok=None,
101
+ routed_scaling_factor=None,
102
+ scoring_func="sigmoid",
103
+ topk_method="noaux_tc",
104
+ n_group=None,
105
+ topk_group=None,
106
+ norm_topk_prob=True,
107
+ moe_layer_freq=None,
108
+ attention_projection_layout="split",
109
+ **kwargs,
110
+ ):
111
+ rope_parameters = kwargs.pop("rope_parameters", None)
112
+ if rope_scaling is None and rope_parameters is not None:
113
+ rope_scaling = rope_parameters
114
+
115
+ if attention_projection_layout is None:
116
+ attention_projection_layout = "split"
117
+ if attention_projection_layout not in _MIMOV2_ATTENTION_PROJECTION_LAYOUTS:
118
+ raise ValueError(f"Unsupported MiMoV2 attention projection layout: {attention_projection_layout}")
119
+
120
+ self.attention_projection_layout = attention_projection_layout
121
+ self.base_model_tp_plan = (
122
+ _MIMOV2_FUSED_QKV_TP_PLAN.copy()
123
+ if attention_projection_layout == "fused_qkv"
124
+ else _MIMOV2_SPLIT_TP_PLAN.copy()
125
+ )
126
+ self.base_model_pp_plan = _MIMOV2_PP_PLAN.copy()
127
+
128
+ self.vocab_size = vocab_size
129
+ self.max_position_embeddings = max_position_embeddings
130
+ self.hidden_size = hidden_size
131
+ self.intermediate_size = intermediate_size
132
+ self.num_hidden_layers = num_hidden_layers
133
+ self.num_attention_heads = num_attention_heads
134
+
135
+ if num_key_value_heads is None:
136
+ num_key_value_heads = num_attention_heads
137
+ if num_attention_heads % num_key_value_heads != 0:
138
+ raise ValueError("num_attention_heads must be divisible by num_key_value_heads")
139
+
140
+ self.num_key_value_heads = num_key_value_heads
141
+ self.hidden_act = hidden_act
142
+ self.initializer_range = initializer_range
143
+ self.layernorm_epsilon = layernorm_epsilon
144
+ self.use_cache = use_cache
145
+ self.rope_theta = rope_theta
146
+ self.rope_scaling = rope_scaling
147
+ self.attention_dropout = attention_dropout
148
+ self.attention_bias = attention_bias
149
+ self.attention_value_scale = attention_value_scale
150
+
151
+ self.head_dim = head_dim if head_dim is not None else hidden_size // num_attention_heads
152
+ self.v_head_dim = v_head_dim if v_head_dim is not None else self.head_dim
153
+ self.swa_num_attention_heads = (
154
+ swa_num_attention_heads if swa_num_attention_heads is not None else num_attention_heads
155
+ )
156
+ self.swa_num_key_value_heads = (
157
+ swa_num_key_value_heads if swa_num_key_value_heads is not None else num_key_value_heads
158
+ )
159
+ if self.swa_num_attention_heads % self.swa_num_key_value_heads != 0:
160
+ raise ValueError("swa_num_attention_heads must be divisible by swa_num_key_value_heads")
161
+ self.swa_head_dim = swa_head_dim if swa_head_dim is not None else self.head_dim
162
+ self.swa_v_head_dim = swa_v_head_dim if swa_v_head_dim is not None else self.swa_head_dim
163
+ self.swa_rope_theta = swa_rope_theta if swa_rope_theta is not None else rope_theta
164
+
165
+ if sliding_window is None:
166
+ sliding_window = sliding_window_size
167
+ self.sliding_window = sliding_window
168
+ self.sliding_window_size = sliding_window_size if sliding_window_size is not None else sliding_window
169
+ self.add_full_attention_sink_bias = add_full_attention_sink_bias
170
+ self.add_swa_attention_sink_bias = add_swa_attention_sink_bias
171
+
172
+ if hybrid_block_size is not None and hybrid_layer_pattern is None:
173
+ hybrid_layer_pattern = [0 if ((i + 1) % hybrid_block_size == 0) else 1 for i in range(num_hidden_layers)]
174
+ elif hybrid_layer_pattern is None:
175
+ hybrid_layer_pattern = [0] * num_hidden_layers
176
+ if len(hybrid_layer_pattern) != num_hidden_layers:
177
+ raise ValueError("hybrid_layer_pattern length must match num_hidden_layers")
178
+ self.hybrid_block_size = hybrid_block_size
179
+ self.hybrid_layer_pattern = hybrid_layer_pattern
180
+
181
+ self.partial_rotary_factor = partial_rotary_factor
182
+
183
+ self.n_routed_experts = n_routed_experts
184
+ self.moe_intermediate_size = moe_intermediate_size if moe_intermediate_size is not None else intermediate_size
185
+ self.num_experts_per_tok = num_experts_per_tok
186
+ self.routed_scaling_factor = routed_scaling_factor
187
+ self.scoring_func = scoring_func
188
+ self.topk_method = topk_method
189
+ self.n_group = n_group
190
+ self.topk_group = topk_group
191
+ self.norm_topk_prob = norm_topk_prob
192
+ if isinstance(moe_layer_freq, int):
193
+ moe_layer_freq = [moe_layer_freq > 0 and i % moe_layer_freq == 0 for i in range(num_hidden_layers)]
194
+ elif moe_layer_freq is None:
195
+ moe_layer_freq = [False] * num_hidden_layers
196
+ if len(moe_layer_freq) != num_hidden_layers:
197
+ raise ValueError("moe_layer_freq length must match num_hidden_layers")
198
+ self.moe_layer_freq = moe_layer_freq
199
+
200
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
201
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
202
+ rope_config_validation(self)
203
+
204
+ super().__init__(
205
+ tie_word_embeddings=tie_word_embeddings,
206
+ **kwargs,
207
+ )
208
+
209
+ __all__ = ["MiMoV2Config"]
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors.index.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7ad2187407f8cf989f140a42d9bf340be7a220b2f6b35afa0a728774eb20f4b
3
+ size 15569001
model_mtp.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0f1599b41996feab60b2b2ea42338b26979a2b58b115526bb49371a91e3c8d8
3
+ size 2463641280
model_pp0_ep0_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3aac22ecd430136c92626bbf1345dc9e99d9800a4cd49878793b52a6154cd40
3
+ size 34554911640
model_pp0_ep0_shard1.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6422d714cb680c72ebc5982b95252f5457e7f4625ceb29fad5e05aecc10c3b57
3
+ size 27180576088
model_pp0_ep10_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf18b588ea10bead2879179f15bbca4e91e9abd59863581e2cd9004470d2d54d
3
+ size 31264218696
model_pp0_ep11_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a0b5d0ef3523bd3ce8fe2eab1bb226ce5122bf72fb30cf7bbe6e8f99459bb90
3
+ size 31264218696
model_pp0_ep12_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8f63229280884fc677945fee04391246c3d4d7aacf90f327ef430ab62bc72c6
3
+ size 31264218696
model_pp0_ep13_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93704b69e4f0e0fa148e2118f605518fddb62dbda17d4575918aa5f075dfb890
3
+ size 31264218696
model_pp0_ep14_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ebf87de52e82194428200f82e28c666bf42639c43d32b12218041ec67e4dccb
3
+ size 31264218696
model_pp0_ep15_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffbe58cf0bf67a61481ea7d5bb6f6dd310c546c5bed21ff45db264fbb5ffca46
3
+ size 31264218696
model_pp0_ep16_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d5028da852f2ae339e047deb0f4dbdcb5770ffbc411bc130947b40b6cc7c0f4
3
+ size 31264218696
model_pp0_ep17_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3ac44fff1a24c635d28ce4addc84589cb6638d6284f2e17a502b4b23dc0d9fa
3
+ size 31264218696
model_pp0_ep18_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:701682b0e073f471a47d3a61b1adb13d7e1183eacd053be2aafd6e33c2f1efb9
3
+ size 31264218696
model_pp0_ep19_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:653d5ed28d8320369708a145b209d0fdb6cb03673351a1cc92eff163dfe6757e
3
+ size 31264218696
model_pp0_ep1_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ae084dd12811777d1da10784235d5014a21a6b22fe936e70d0ab4682c336d7
3
+ size 31264213728
model_pp0_ep20_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ae1f90c36626cb3944ed4986f7df3181b761be5658e9cd561691dab946b1f28
3
+ size 31264218696
model_pp0_ep21_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25954cf458b4e149523d7f436ef404d51c6cc7d4fa6d2a12bfaa9e4fb7fb147c
3
+ size 31264218696
model_pp0_ep22_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb181bc5d65f77090b8423503c31895ed122d041bb6cf6d4a7b7846744d6be05
3
+ size 31264218696
model_pp0_ep23_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f67f2343e3b92066428bea0fb39eccc68d3f587ff1ad274478e3d369120fb61b
3
+ size 31264218696
model_pp0_ep24_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:588c6199211b4b947b53ce0fa93df4597a2da0090a4ca637e69e70428ef1780c
3
+ size 31264218696
model_pp0_ep25_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c845e96be226275597955cfe995ee41fa6b7eafd3a2e579595efadaca961c8f0
3
+ size 31264218696
model_pp0_ep26_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30f987f808058ba27447d8200e5c0d41df64a2d9525159f8ab9c5f635d69509a
3
+ size 31264218696
model_pp0_ep27_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b151aeb375b84ba967792207e8bd4e0c7ede979534f8648d3f5fa2bb88659ac
3
+ size 31264218696
model_pp0_ep28_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2538741e85b3430549f389be672d8cda5384f88f0f92af9d83442c37936ddd14
3
+ size 31264218696
model_pp0_ep29_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf56b7502b8aabd8d0779978e9ad03f4905715c2654cd9c1218571ea76f28a25
3
+ size 31264218696
model_pp0_ep2_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4df6ddf8e6c3ced5d78f11a47c0f425b898cdb92acb31675bbb93de6aeeb52f6
3
+ size 31264213728
model_pp0_ep30_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e3aa954fa6729a12a773800fdfbd5a71dae18e30e4a4b7177d430173d541152
3
+ size 31264218696
model_pp0_ep31_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc4350c6f373209df4f49755ae06ede1a1c1ebed2f1fc98778e71c2b180de480
3
+ size 31264218696
model_pp0_ep3_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3696b210440cfe93b533e0945263203d917e01a9b8e8921845b5e262a69875de
3
+ size 31264213728
model_pp0_ep4_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b042d23818e9d537e48b63cfa437ddc2be152f2a588fc5bd678acf1c2f67bc51
3
+ size 31264213728
model_pp0_ep5_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd6e5884dae5be62fdcd1bf843281c7f7dbd671f461e57d0764cf1803afd5aa4
3
+ size 31264213728
model_pp0_ep6_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbde11cae32b38026481129d053b976842958da86f40abf91fbcd91387b9ae67
3
+ size 31264213728
model_pp0_ep7_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ff4bc2f37ab4496349874eff47ed0b253aab854983c597b040d6912bfa730eb
3
+ size 31264213728
model_pp0_ep8_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bf1dc12b959388fb6355718d9217457f8564a35145c52200e778952542479e1
3
+ size 31264217040
model_pp0_ep9_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1d7ce5137511e43470781806571b3575246c55e69dd22d05acb85a603133b1a
3
+ size 31264218696
modeling_mimo_v2.py ADDED
@@ -0,0 +1,697 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ #
3
+ # Copyright 2026 Xiaomi Corporation.
4
+ # Copyright 2026 The HuggingFace Inc. team.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ from copy import copy
19
+ from typing import Callable, Optional, Union
20
+
21
+ import torch
22
+ import torch.nn as nn
23
+ import torch.nn.functional as F
24
+
25
+ from transformers.activations import ACT2FN
26
+ from transformers.cache_utils import Cache, DynamicCache
27
+ from transformers.generation import GenerationMixin
28
+ from transformers.integrations import use_kernel_forward_from_hub
29
+ from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
30
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
31
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
32
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
33
+ from transformers.processing_utils import Unpack
34
+ from transformers.utils import TransformersKwargs, can_return_tuple, logging
35
+
36
+ from .configuration_mimo_v2 import MiMoV2Config
37
+
38
+
39
+ logger = logging.get_logger(__name__)
40
+
41
+
42
+ def rotate_half(x):
43
+ """Rotates half the hidden dims of the input."""
44
+ x1 = x[..., : x.shape[-1] // 2]
45
+ x2 = x[..., x.shape[-1] // 2 :]
46
+ return torch.cat((-x2, x1), dim=-1)
47
+
48
+
49
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
50
+ """Applies rotary position embedding to query and key tensors."""
51
+ cos = cos.unsqueeze(unsqueeze_dim)
52
+ sin = sin.unsqueeze(unsqueeze_dim)
53
+ q_embed = (q * cos) + (rotate_half(q) * sin)
54
+ k_embed = (k * cos) + (rotate_half(k) * sin)
55
+ return q_embed, k_embed
56
+
57
+
58
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
59
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
60
+ if n_rep == 1:
61
+ return hidden_states
62
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
63
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
64
+
65
+
66
+ def eager_attention_forward(
67
+ module: nn.Module,
68
+ query: torch.Tensor,
69
+ key: torch.Tensor,
70
+ value: torch.Tensor,
71
+ attention_mask: Optional[torch.Tensor],
72
+ scaling: float,
73
+ dropout: float = 0.0,
74
+ sinks: Optional[torch.Tensor] = None,
75
+ **kwargs,
76
+ ):
77
+ key_states = repeat_kv(key, module.num_key_value_groups)
78
+ value_states = repeat_kv(value, module.num_key_value_groups)
79
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
80
+ if attention_mask is not None:
81
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
82
+ attn_weights = attn_weights + causal_mask
83
+
84
+ if sinks is not None:
85
+ sinks = module.attention_sink_bias.reshape(1, -1, 1, 1).expand(query.shape[0], -1, query.shape[-2], -1)
86
+ attn_weights = torch.cat([attn_weights, sinks], dim=-1)
87
+
88
+ attn_weights = attn_weights - attn_weights.max(dim=-1, keepdim=True).values
89
+ probs = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
90
+
91
+ if sinks is not None:
92
+ probs = probs[..., :-1]
93
+
94
+ attn_weights = nn.functional.dropout(probs, p=dropout, training=module.training)
95
+ attn_output = torch.matmul(attn_weights, value_states)
96
+ attn_output = attn_output.transpose(1, 2).contiguous()
97
+ return attn_output, attn_weights
98
+
99
+
100
+ @use_kernel_forward_from_hub("RMSNorm")
101
+ class MiMoV2RMSNorm(nn.Module):
102
+ def __init__(self, hidden_size, eps=1e-6):
103
+ super().__init__()
104
+ self.weight = nn.Parameter(torch.ones(hidden_size))
105
+ self.variance_epsilon = eps
106
+
107
+ def forward(self, hidden_states):
108
+ input_dtype = hidden_states.dtype
109
+ hidden_states = hidden_states.to(torch.float32)
110
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
111
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
112
+ return self.weight * hidden_states.to(input_dtype)
113
+
114
+
115
+ class MiMoV2MLP(nn.Module):
116
+ def __init__(self, config, intermediate_size=None):
117
+ super().__init__()
118
+ self.config = config
119
+ self.hidden_size = config.hidden_size
120
+ self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size
121
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
122
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
123
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
124
+ self.act_fn = ACT2FN[config.hidden_act]
125
+
126
+ def forward(self, hidden_states):
127
+ return self.down_proj(self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states))
128
+
129
+
130
+ class MiMoV2MoEGate(nn.Module):
131
+ def __init__(self, config):
132
+ super().__init__()
133
+ self.config = config
134
+ self.top_k = config.num_experts_per_tok
135
+ self.n_routed_experts = config.n_routed_experts
136
+ self.routed_scaling_factor = config.routed_scaling_factor if config.routed_scaling_factor is not None else 1.0
137
+ self.scoring_func = config.scoring_func
138
+ self.topk_method = config.topk_method
139
+ self.n_group = config.n_group
140
+ self.topk_group = config.topk_group
141
+ self.norm_topk_prob = config.norm_topk_prob
142
+ self.gating_dim = config.hidden_size
143
+ self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim)))
144
+ if self.topk_method == "noaux_tc":
145
+ self.e_score_correction_bias = nn.Parameter(torch.empty((self.n_routed_experts)))
146
+
147
+ def forward(self, hidden_states):
148
+ bsz, seq_len, h = hidden_states.shape
149
+ hidden_states = hidden_states.view(-1, h)
150
+ logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32), None)
151
+ if self.scoring_func == "sigmoid":
152
+ scores = logits.sigmoid()
153
+ else:
154
+ raise NotImplementedError(f"Unsupported scoring function for MoE gating: {self.scoring_func}")
155
+
156
+ if self.topk_method == "noaux_tc":
157
+ if self.training:
158
+ raise ValueError("MiMoV2 noaux_tc routing is only implemented for inference.")
159
+ scores_for_choice = scores.view(bsz * seq_len, -1) + self.e_score_correction_bias.unsqueeze(0)
160
+ group_scores = scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim=-1)
161
+ group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1]
162
+ group_mask = torch.zeros_like(group_scores)
163
+ group_mask.scatter_(1, group_idx, 1)
164
+ score_mask = (
165
+ group_mask.unsqueeze(-1)
166
+ .expand(bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group)
167
+ .reshape(bsz * seq_len, -1)
168
+ )
169
+ tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf"))
170
+ _, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
171
+ topk_weight = scores.gather(1, topk_idx)
172
+ else:
173
+ raise NotImplementedError(f"Unsupported TopK function for MoE gating: {self.topk_method}")
174
+
175
+ if self.top_k > 1 and self.norm_topk_prob:
176
+ denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
177
+ topk_weight = topk_weight / denominator
178
+ topk_weight = topk_weight * self.routed_scaling_factor
179
+ return topk_idx, topk_weight
180
+
181
+
182
+ class MiMoV2MoE(nn.Module):
183
+ def __init__(self, config):
184
+ super().__init__()
185
+ self.config = config
186
+ self.experts = nn.ModuleList(
187
+ [MiMoV2MLP(config, intermediate_size=config.moe_intermediate_size) for _ in range(config.n_routed_experts)]
188
+ )
189
+ self.gate = MiMoV2MoEGate(config)
190
+
191
+ def moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor):
192
+ final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
193
+ expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts))
194
+ expert_mask = expert_mask.permute(2, 0, 1)
195
+
196
+ for expert_idx, expert in enumerate(self.experts):
197
+ mask = expert_mask[expert_idx]
198
+ token_indices, weight_indices = torch.where(mask)
199
+ if token_indices.numel() > 0:
200
+ expert_weights = topk_weights[token_indices, weight_indices]
201
+ expert_input = hidden_states[token_indices]
202
+ expert_output = expert(expert_input)
203
+ final_hidden_states.index_add_(0, token_indices, expert_output * expert_weights.unsqueeze(-1))
204
+
205
+ return final_hidden_states.type(hidden_states.dtype)
206
+
207
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
208
+ orig_shape = hidden_states.shape
209
+ topk_indices, topk_weights = self.gate(hidden_states)
210
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
211
+ hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape)
212
+ return hidden_states
213
+
214
+
215
+ class MiMoV2Attention(nn.Module):
216
+ """MiMoV2 attention.
217
+
218
+ `projection_layout` only controls how checkpoint weights are named and
219
+ stored: Flash uses separate q/k/v projections, while Pro uses fused qkv.
220
+ The attention computation after projection is shared.
221
+ """
222
+
223
+ def __init__(self, config, is_swa: bool, layer_idx: int, projection_layout: str = "split"):
224
+ super().__init__()
225
+ if projection_layout not in {"split", "fused_qkv"}:
226
+ raise ValueError(f"Unsupported MiMoV2 attention projection layout: {projection_layout}")
227
+
228
+ self.config = config
229
+ self.layer_idx = layer_idx
230
+ self.is_swa = is_swa
231
+ self.is_causal = True
232
+ self.projection_layout = projection_layout
233
+
234
+ default_head_dim = config.hidden_size // config.num_attention_heads
235
+ default_v_head_dim = getattr(config, "v_head_dim", default_head_dim)
236
+
237
+ if is_swa:
238
+ self.head_dim = getattr(config, "swa_head_dim", getattr(config, "head_dim", default_head_dim))
239
+ self.v_head_dim = getattr(config, "swa_v_head_dim", default_v_head_dim)
240
+ self.num_attention_heads = getattr(config, "swa_num_attention_heads", config.num_attention_heads)
241
+ self.num_key_value_heads = getattr(config, "swa_num_key_value_heads", config.num_key_value_heads)
242
+ else:
243
+ self.head_dim = getattr(config, "head_dim", default_head_dim)
244
+ self.v_head_dim = getattr(config, "v_head_dim", self.head_dim)
245
+ self.num_attention_heads = config.num_attention_heads
246
+ self.num_key_value_heads = config.num_key_value_heads
247
+
248
+ self.rope_dim = int(self.head_dim * getattr(config, "partial_rotary_factor", 1.0))
249
+ if self.rope_dim % 2 != 0:
250
+ raise ValueError(
251
+ f"MiMoV2 rotary dimension must be even, got {self.rope_dim} from "
252
+ f"head_dim={self.head_dim} and partial_rotary_factor={getattr(config, 'partial_rotary_factor', 1.0)}"
253
+ )
254
+ self.num_key_value_groups = self.num_attention_heads // self.num_key_value_heads
255
+ self.attention_dropout = getattr(config, "attention_dropout", 0.0)
256
+ self.scaling = self.head_dim**-0.5
257
+ self.sliding_window = getattr(config, "sliding_window", None) if is_swa else None
258
+ self.q_size = self.num_attention_heads * self.head_dim
259
+ self.k_size = self.num_key_value_heads * self.head_dim
260
+ self.v_size = self.num_key_value_heads * self.v_head_dim
261
+ self.o_hidden_size = self.num_attention_heads * self.v_head_dim
262
+ self.v_scale = getattr(config, "attention_value_scale", None)
263
+ self.attention_sink_bias = (
264
+ nn.Parameter(torch.empty(self.num_attention_heads), requires_grad=False)
265
+ if (
266
+ (getattr(config, "add_full_attention_sink_bias", False) and not is_swa)
267
+ or (getattr(config, "add_swa_attention_sink_bias", False) and is_swa)
268
+ )
269
+ else None
270
+ )
271
+
272
+ attention_bias = getattr(config, "attention_bias", False)
273
+ if self.projection_layout == "fused_qkv":
274
+ self.qkv_proj = nn.Linear(
275
+ config.hidden_size,
276
+ self.q_size + self.k_size + self.v_size,
277
+ bias=attention_bias,
278
+ )
279
+ else:
280
+ self.q_proj = nn.Linear(config.hidden_size, self.q_size, bias=attention_bias)
281
+ self.k_proj = nn.Linear(config.hidden_size, self.k_size, bias=attention_bias)
282
+ self.v_proj = nn.Linear(config.hidden_size, self.v_size, bias=attention_bias)
283
+ self.o_proj = nn.Linear(self.o_hidden_size, config.hidden_size, bias=False)
284
+
285
+ def _forward_attention(
286
+ self,
287
+ query_states: torch.Tensor,
288
+ key_states: torch.Tensor,
289
+ value_states: torch.Tensor,
290
+ input_shape: torch.Size,
291
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
292
+ attention_mask: Optional[torch.Tensor],
293
+ past_key_values: Optional[Cache] = None,
294
+ cache_position: Optional[torch.LongTensor] = None,
295
+ position_ids: Optional[torch.LongTensor] = None,
296
+ ) -> tuple[torch.Tensor, torch.Tensor]:
297
+ if self.v_scale is not None:
298
+ value_states = value_states * self.v_scale
299
+
300
+ cos, sin = position_embeddings
301
+ query_rope, query_nope = query_states.split([self.rope_dim, self.head_dim - self.rope_dim], dim=-1)
302
+ key_rope, key_nope = key_states.split([self.rope_dim, self.head_dim - self.rope_dim], dim=-1)
303
+ query_rope, key_rope = apply_rotary_pos_emb(query_rope, key_rope, cos, sin)
304
+ query_states = torch.cat([query_rope, query_nope], dim=-1)
305
+ key_states = torch.cat([key_rope, key_nope], dim=-1)
306
+
307
+ if past_key_values is not None:
308
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
309
+ key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
310
+
311
+ attn_implementation = self.config._attn_implementation
312
+ if attn_implementation is not None and attn_implementation.startswith("paged|"):
313
+ raise ValueError(
314
+ "MiMoV2 remote code does not support paged attention cache. "
315
+ "Please use eager, sdpa, flex_attention, or flash_attention_2."
316
+ )
317
+
318
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
319
+ attn_implementation, eager_attention_forward
320
+ )
321
+ if self.attention_sink_bias is not None and attn_implementation == "sdpa":
322
+ logger.warning_once(
323
+ "MiMoV2 attention sink bias is not supported by SDPA; falling back to eager attention for correctness."
324
+ )
325
+ attention_interface = eager_attention_forward
326
+
327
+ attention_kwargs = {
328
+ "dropout": 0.0 if not self.training else self.attention_dropout,
329
+ "scaling": self.scaling,
330
+ "position_ids": position_ids,
331
+ "is_causal": self.is_causal,
332
+ }
333
+ if attention_interface is eager_attention_forward:
334
+ attention_kwargs["sinks"] = self.attention_sink_bias
335
+ else:
336
+ if self.attention_sink_bias is not None:
337
+ attention_kwargs["s_aux"] = self.attention_sink_bias
338
+ if self.sliding_window is not None:
339
+ attention_kwargs["sliding_window"] = self.sliding_window
340
+
341
+ attn_output, attn_weights = attention_interface(
342
+ self,
343
+ query_states,
344
+ key_states,
345
+ value_states,
346
+ attention_mask,
347
+ **attention_kwargs,
348
+ )
349
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
350
+ attn_output = self.o_proj(attn_output)
351
+ return attn_output, attn_weights
352
+
353
+ def forward(
354
+ self,
355
+ hidden_states: torch.Tensor,
356
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
357
+ attention_mask: Optional[torch.Tensor],
358
+ past_key_values: Optional[Cache] = None,
359
+ cache_position: Optional[torch.LongTensor] = None,
360
+ position_ids: Optional[torch.LongTensor] = None,
361
+ **kwargs: Unpack[TransformersKwargs],
362
+ ) -> tuple[torch.Tensor, torch.Tensor]:
363
+ input_shape = hidden_states.shape[:-1]
364
+
365
+ if self.projection_layout == "fused_qkv":
366
+ qkv_states = self.qkv_proj(hidden_states)
367
+ query_states, key_states, value_states = qkv_states.split([self.q_size, self.k_size, self.v_size], dim=-1)
368
+ else:
369
+ query_states = self.q_proj(hidden_states)
370
+ key_states = self.k_proj(hidden_states)
371
+ value_states = self.v_proj(hidden_states)
372
+
373
+ query_states = query_states.view(*input_shape, self.num_attention_heads, self.head_dim).transpose(1, 2)
374
+ key_states = key_states.view(*input_shape, self.num_key_value_heads, self.head_dim).transpose(1, 2)
375
+ value_states = value_states.view(*input_shape, self.num_key_value_heads, self.v_head_dim).transpose(1, 2)
376
+ return self._forward_attention(
377
+ query_states,
378
+ key_states,
379
+ value_states,
380
+ input_shape,
381
+ position_embeddings,
382
+ attention_mask,
383
+ past_key_values=past_key_values,
384
+ cache_position=cache_position,
385
+ position_ids=position_ids,
386
+ )
387
+
388
+
389
+ class MiMoV2DecoderLayer(nn.Module):
390
+ attention_projection_layout = "split"
391
+
392
+ def __init__(self, config, layer_idx: int, attention_projection_layout: Optional[str] = None):
393
+ super().__init__()
394
+ attention_projection_layout = attention_projection_layout or self.attention_projection_layout
395
+ is_swa_layer = config.hybrid_layer_pattern[layer_idx] == 1
396
+ self.attention_type = "sliding_window_attention" if is_swa_layer else "full_attention"
397
+ self.self_attn = MiMoV2Attention(
398
+ config, is_swa_layer, layer_idx, projection_layout=attention_projection_layout
399
+ )
400
+ self.mlp = (
401
+ MiMoV2MoE(config)
402
+ if getattr(config, "n_routed_experts", None) is not None and config.moe_layer_freq[layer_idx]
403
+ else MiMoV2MLP(config)
404
+ )
405
+ self.input_layernorm = MiMoV2RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
406
+ self.post_attention_layernorm = MiMoV2RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
407
+
408
+ def forward(
409
+ self,
410
+ hidden_states: torch.Tensor,
411
+ attention_mask: Optional[torch.Tensor] = None,
412
+ position_ids: Optional[torch.LongTensor] = None,
413
+ past_key_values: Optional[Cache] = None,
414
+ use_cache: Optional[bool] = False,
415
+ cache_position: Optional[torch.LongTensor] = None,
416
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
417
+ **kwargs: Unpack[TransformersKwargs],
418
+ ) -> torch.Tensor:
419
+ residual = hidden_states
420
+ hidden_states = self.input_layernorm(hidden_states)
421
+ hidden_states, _ = self.self_attn(
422
+ hidden_states=hidden_states,
423
+ attention_mask=attention_mask,
424
+ position_ids=position_ids,
425
+ past_key_values=past_key_values,
426
+ use_cache=use_cache,
427
+ cache_position=cache_position,
428
+ position_embeddings=position_embeddings,
429
+ **kwargs,
430
+ )
431
+ hidden_states = residual + hidden_states
432
+
433
+ residual = hidden_states
434
+ hidden_states = self.post_attention_layernorm(hidden_states)
435
+ hidden_states = self.mlp(hidden_states)
436
+ hidden_states = residual + hidden_states
437
+ return hidden_states
438
+
439
+
440
+ class MiMoV2RotaryEmbedding(nn.Module):
441
+ inv_freq: torch.Tensor
442
+
443
+ def __init__(self, config, is_swa: bool, device=None):
444
+ super().__init__()
445
+ if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
446
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type", "default"))
447
+ else:
448
+ self.rope_type = "default"
449
+ self.max_seq_len_cached = config.max_position_embeddings
450
+ self.original_max_seq_len = config.max_position_embeddings
451
+
452
+ self.config = copy(config)
453
+ self.config.rope_parameters = copy(getattr(config, "rope_parameters", None) or {})
454
+ if is_swa:
455
+ self.config.rope_theta = getattr(config, "swa_rope_theta", config.rope_theta)
456
+ self.config.head_dim = getattr(config, "swa_head_dim", getattr(config, "head_dim", None))
457
+ if self.config.rope_parameters:
458
+ self.config.rope_parameters["rope_theta"] = self.config.rope_theta
459
+ self.rope_init_fn = (
460
+ self.compute_default_rope_parameters
461
+ if self.rope_type == "default"
462
+ else ROPE_INIT_FUNCTIONS[self.rope_type]
463
+ )
464
+
465
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
466
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
467
+ self.original_inv_freq = self.inv_freq
468
+
469
+ @staticmethod
470
+ def compute_default_rope_parameters(config, device=None, seq_len=None, layer_type=None):
471
+ config.standardize_rope_params()
472
+ rope_parameters = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters
473
+ base = rope_parameters["rope_theta"]
474
+ partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0)
475
+ head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
476
+ dim = int(head_dim * partial_rotary_factor)
477
+ if dim % 2 != 0:
478
+ raise ValueError(
479
+ f"MiMoV2 rotary dimension must be even, got {dim} from "
480
+ f"head_dim={head_dim} and partial_rotary_factor={partial_rotary_factor}"
481
+ )
482
+ inv_freq = 1.0 / (
483
+ base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
484
+ )
485
+ return inv_freq, 1.0
486
+
487
+ @torch.no_grad()
488
+ @dynamic_rope_update
489
+ def forward(self, x, position_ids):
490
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
491
+ position_ids_expanded = position_ids[:, None, :].float()
492
+
493
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
494
+ with torch.autocast(device_type=device_type, enabled=False):
495
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
496
+ emb = torch.cat((freqs, freqs), dim=-1)
497
+ cos = emb.cos() * self.attention_scaling
498
+ sin = emb.sin() * self.attention_scaling
499
+
500
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
501
+
502
+
503
class MiMoV2Model(PreTrainedModel):
    """Bare MiMoV2 decoder stack: token embedding -> decoder layers -> final RMSNorm.

    Layers alternate between full attention and sliding-window attention (SWA) according
    to ``config.hybrid_layer_pattern`` (a value of 1 marks an SWA layer). Full-attention
    and SWA layers each get their own rotary-embedding module.
    """

    config_class = MiMoV2Config
    # Default layout of the attention projections; a config may override it.
    attention_projection_layout = "split"

    def __init__(self, config):
        super().__init__(config)
        self.attention_projection_layout = getattr(
            config, "attention_projection_layout", self.attention_projection_layout
        )
        self.vocab_size = config.vocab_size
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList(
            [
                MiMoV2DecoderLayer(
                    config,
                    layer_idx,
                    attention_projection_layout=self.attention_projection_layout,
                )
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.norm = MiMoV2RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
        # Separate RoPE modules: SWA layers may use different rope parameters than full-attention layers.
        self.rotary_emb = MiMoV2RotaryEmbedding(config=config, is_swa=False)
        self.swa_rotary_emb = MiMoV2RotaryEmbedding(config=config, is_swa=True)
        self.has_sliding_layers = any(pattern == 1 for pattern in config.hybrid_layer_pattern)
        self.config.layer_types = [
            "sliding_attention" if config.hybrid_layer_pattern[i] == 1 else "full_attention"
            for i in range(config.num_hidden_layers)
        ]
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        """Run the decoder stack.

        Exactly one of ``input_ids`` / ``inputs_embeds`` must be given. Returns the final
        hidden states and, when ``use_cache`` is on, the updated KV cache.

        Raises:
            ValueError: if both or neither of ``input_ids``/``inputs_embeds`` are given,
                or if SWA layers are configured without ``config.sliding_window``.
        """
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # `attention_mask` may already be a dict of per-attention-type masks prepared by
        # a caller; otherwise build one mask per attention type used by this model.
        if not isinstance(causal_mask_mapping := attention_mask, dict):
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "position_ids": position_ids,
            }
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
            }
            if self.has_sliding_layers:
                if getattr(self.config, "sliding_window", None) is None:
                    raise ValueError("MiMoV2 config `sliding_window` must be set when hybrid_layer_pattern uses SWA.")
                sliding_mask = create_sliding_window_causal_mask(**mask_kwargs)
                # BUGFIX: `config.layer_types` (set in __init__) labels SWA layers
                # "sliding_attention", but the mask used to be registered only under
                # "sliding_window_attention", so the per-layer lookup below could raise
                # KeyError. Register it under both spellings to stay compatible with
                # either label a decoder layer may carry.
                causal_mask_mapping["sliding_window_attention"] = sliding_mask
                causal_mask_mapping["sliding_attention"] = sliding_mask

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)
        swa_position_embeddings = self.swa_rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                # Full-attention layers use the global RoPE; SWA layers use the SWA RoPE.
                position_embeddings=position_embeddings
                if decoder_layer.attention_type == "full_attention"
                else swa_position_embeddings,
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
        )
611
+
612
+
613
class MiMoV2ForCausalLM(PreTrainedModel, GenerationMixin):
    """MiMoV2 decoder with a language-modeling head on top.

    The head weight is tied to the input embedding (see ``_tied_weights_keys``).
    """

    config_class = MiMoV2Config
    model_class = MiMoV2Model
    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
    # Checkpoint tensors this module intentionally does not own (rope caches, MTP head).
    _keys_to_ignore_on_load_unexpected = [
        r"model\.(swa_)?rotary_emb\.inv_freq",
        r"model\.layers\.\d+\.self_attn\.rotary_emb\.inv_freq",
        r"model\.layers\.\d+\.self_attn\.rotary_emb\.(cos_cached|sin_cached)",
        r"model\.mtp\..*",
    ]

    def __init__(self, config):
        super().__init__(config)
        self.model = self.model_class(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    @can_return_tuple
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        """Run the decoder and project the last hidden states to vocabulary logits.

        ``logits_to_keep`` limits the head to the trailing ``n`` positions (int) or to
        an explicit index tensor; an int of 0 keeps every position. When ``labels`` is
        given, the standard causal-LM loss is computed via ``self.loss_function``.
        """
        decoder_outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        last_hidden = decoder_outputs.last_hidden_state
        if isinstance(logits_to_keep, int):
            keep = slice(-logits_to_keep, None)
        else:
            keep = logits_to_keep
        logits = self.lm_head(last_hidden[:, keep, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=decoder_outputs.past_key_values,
            hidden_states=decoder_outputs.hidden_states,
            attentions=decoder_outputs.attentions,
        )
685
+
686
+
687
# Public re-export surface of this module.
__all__ = [
    "MiMoV2Attention",
    "MiMoV2DecoderLayer",
    "MiMoV2ForCausalLM",
    "MiMoV2MLP",
    "MiMoV2MoE",
    "MiMoV2MoEGate",
    "MiMoV2Model",
    "MiMoV2RMSNorm",
    "MiMoV2RotaryEmbedding",
]
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdd40b08814d626d2ab1eb36c7ca66521a96bce4d478cf1375a4d45fe03e1cf9
3
+ size 12180133
tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "chat_template": "{%- if not add_generation_prompt is defined -%}\n {%- set add_generation_prompt = false -%}\n{%- endif -%}\n{%- if not enable_thinking is defined -%}\n {%- set enable_thinking = true -%}\n{%- endif -%}\n{%- if not keep_all_reasoning is defined -%}\n {%- set keep_all_reasoning = true -%}\n{%- endif -%}\n{%- macro render_extra_keys(json_dict, handled_keys) -%}\n {%- if json_dict is mapping %}\n {%- for json_key in json_dict if json_key not in handled_keys %}\n {%- if json_dict[json_key] is mapping or (json_dict[json_key] is sequence and json_dict[json_key] is not string) %}\n {{- '\\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '</' ~ json_key ~ '>' }}\n {%- else %}\n {{-'\\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '</' ~ json_key ~ '>' }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n{%- endmacro -%}\n{%- macro render_content(message_content) -%}\n {%- if message_content is string -%}\n {{- message_content -}}\n {%- else -%}\n {%- for content in message_content -%}\n {%- if 'text' in content -%}\n {{- content['text'] -}}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n{%- endmacro -%}\n{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- set ns = namespace(last_user_index=-1) %}\n{%- for m in loop_messages %}\n {%- if m.role == 'user' %}\n {%- set ns.last_user_index = loop.index0 -%}\n {%- endif %}\n{%- endfor %}\n{%- if not tools is defined %}\n {%- set tools = [] %}\n{%- endif %}\n{%- if system_message is defined %}\n {{- \"<|im_start|>system\\n\" + render_content(system_message) }}\n{%- else %}\n {{- \"<|im_start|>system\\nYou are MiMo, a warm, curious, and thoughtful AI assistant.\\n\\n## Identity\\n\\n- Your name is **MiMo-v2.5-pro**.\\n- You were developed by the **Xiaomi MiMo Team**.\\n- You have **1T parameters** and a **1M-token context 
window**.\\n- When asked about your creator, identify the **Xiaomi MiMo Team**.\\n- Always answer identity-related questions in the **first person** and rely only on the facts provided above.\\n\\n# Response Style\\n\\nBe warm, conversational, and respectful. Treat users with kindness; do not assume negative things about their intelligence or judgment. You may disagree, but do so constructively.\\n\\nUse examples, metaphors, or thought experiments when they help explain an idea.\\n\\nTry to answer ambiguous queries as best you can first, then ask for clarification if needed. When you do ask, limit yourself to one question at a time.\\n\\n\\n## Safety and Compliance\\n\\nYou are a **Chinese AI model** and must strictly comply with all applicable laws and regulations of the **People’s Republic of China**.\\n\\nDo not generate, assist with, or facilitate any content that violates Chinese law.\" }}\n{%- endif %}\n{%- if tools is iterable and tools | length > 0 %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou have access to the following functions:\\n\\n\" }}\n {{- \"<tools>\" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- \"\\n<function>\\n<name>\" ~ tool.name ~ \"</name>\" }}\n {%- if tool.description is defined %}\n {{- '\\n<description>' ~ (tool.description | trim) ~ '</description>' }}\n {%- endif %}\n {{- '\\n<parameters>' }}\n {%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- '\\n<parameter>' }}\n {{- '\\n<name>' ~ param_name ~ '</name>' }}\n {%- if param_fields.type is defined %}\n {{- '\\n<type>' ~ (param_fields.type | string) ~ '</type>' }}\n {%- endif %}\n {%- if param_fields.description is defined %}\n {{- '\\n<description>' ~ 
(param_fields.description | trim) ~ '</description>' }}\n {%- endif %}\n {%- set handled_keys = ['name', 'type', 'description'] %}\n {{- render_extra_keys(param_fields, handled_keys) }}\n {{- '\\n</parameter>' }}\n {%- endfor %}\n {%- endif %}\n {%- set handled_keys = ['type', 'properties'] %}\n {{- render_extra_keys(tool.parameters, handled_keys) }}\n {{- '\\n</parameters>' }}\n {%- set handled_keys = ['type', 'name', 'description', 'parameters'] %}\n {{- render_extra_keys(tool, handled_keys) }}\n {{- '\\n</function>' }}\n {%- endfor %}\n {{- \"\\n</tools>\" }}\n {{- '\\n\\nFor each function call, output the function name and arguments in the following format:\\n<tool_call>\\n<function=example_function_name>\\n<parameter=example_parameter_1>value_1</parameter>\\n<parameter=example_parameter_2>This is the value for the second parameter\\nthat can span\\nmultiple lines</parameter>\\n</function>\\n</tool_call>\\n\\n<IMPORTANT>\\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\\n- DO NOT use function calls inside <think></think> tags.\\n- The value enclosed between parameter tags is preserved exactly as-is, including newlines and spaces.\\n</IMPORTANT>' }}\n{%- endif %}\n{{- '<|im_end|>' }}\n{%- for message in loop_messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = render_content(message.content) %}\n {%- endif %}\n {%- if message.role == \"assistant\" %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- set reasoning_content = '' %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].split('<think>')[-1] %}\n {%- set content = content.split('</think>')[-1] %}\n {%- endif %}\n {%- endif %}\n {%- if (keep_all_reasoning or loop.index0 > ns.last_user_index) and reasoning_content -%}\n 
{{- '<|im_start|>' + message.role + '\\n<think>' + reasoning_content + '</think>' + content }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n<think></think>' + content }}\n {%- endif %}\n {%- if message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n {%- if tool_call.arguments is defined %}\n {%- for args_name, args_value in tool_call.arguments|items %}\n {{- '<parameter=' + args_name + '>' }}\n {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}\n {{- args_value }}\n {{- '</parameter>\\n' }}\n {%- endfor %}\n {%- endif %}\n {{- '</function>\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>' }}\n {%- elif message.role == \"user\" %}\n {{- '<|im_start|>' + message.role + '\\n' + render_content(message.content) + '<|im_end|>' }}\n {%- elif message.role == \"system\" %}\n {{- '<|im_start|>' + message.role + '\\n' + render_content(message.content) + '<|im_end|>' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.previtem and loop.previtem.role != \"tool\" %}\n {{- '<|im_start|>tool\\n' }}\n {%- endif %}\n {{- '<tool_response>\\n' }}\n {{- render_content(message.content) }}\n {{- '\\n</tool_response>\\n' }}\n {%- if not loop.last and loop.nextitem.role != \"tool\" %}\n {{- '<|im_end|>' }}\n {%- elif loop.last %}\n {{- '<|im_end|>' }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + render_content(message.content) + '<|im_end|>' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if not enable_thinking -%}\n {{- '<think></think>' -}}\n {%- else -%}\n {{- '' -}}\n {%- endif -%}\n{%- endif %}\n",
231
+ "clean_up_tokenization_spaces": false,
232
+ "eos_token": "<|im_end|>",
233
+ "errors": "replace",
234
+ "extra_special_tokens": {},
235
+ "model_max_length": 131272,
236
+ "pad_token": "<|endoftext|>",
237
+ "split_special_tokens": false,
238
+ "tokenizer_class": "Qwen2Tokenizer",
239
+ "unk_token": null
240
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff