invinciblejha01 bwshen-mi committed on
Commit
8715df4
·
0 Parent(s):

Duplicate from XiaomiMiMo/MiMo-V2.5-Pro

Browse files

Co-authored-by: Bowen Shen <bwshen-mi@users.noreply.huggingface.co>

Files changed (49) hide show
  1. .gitattributes +40 -0
  2. README.md +218 -0
  3. added_tokens.json +28 -0
  4. assets/architecture.png +3 -0
  5. assets/benchmark.jpg +3 -0
  6. assets/post_training_evaluation.jpg +3 -0
  7. config.json +156 -0
  8. configuration_mimo_v2.py +209 -0
  9. merges.txt +0 -0
  10. model.safetensors.index.json +3 -0
  11. model_mtp.safetensors +3 -0
  12. model_pp0_ep0_shard0.safetensors +3 -0
  13. model_pp0_ep0_shard1.safetensors +3 -0
  14. model_pp0_ep10_shard0.safetensors +3 -0
  15. model_pp0_ep11_shard0.safetensors +3 -0
  16. model_pp0_ep12_shard0.safetensors +3 -0
  17. model_pp0_ep13_shard0.safetensors +3 -0
  18. model_pp0_ep14_shard0.safetensors +3 -0
  19. model_pp0_ep15_shard0.safetensors +3 -0
  20. model_pp0_ep16_shard0.safetensors +3 -0
  21. model_pp0_ep17_shard0.safetensors +3 -0
  22. model_pp0_ep18_shard0.safetensors +3 -0
  23. model_pp0_ep19_shard0.safetensors +3 -0
  24. model_pp0_ep1_shard0.safetensors +3 -0
  25. model_pp0_ep20_shard0.safetensors +3 -0
  26. model_pp0_ep21_shard0.safetensors +3 -0
  27. model_pp0_ep22_shard0.safetensors +3 -0
  28. model_pp0_ep23_shard0.safetensors +3 -0
  29. model_pp0_ep24_shard0.safetensors +3 -0
  30. model_pp0_ep25_shard0.safetensors +3 -0
  31. model_pp0_ep26_shard0.safetensors +3 -0
  32. model_pp0_ep27_shard0.safetensors +3 -0
  33. model_pp0_ep28_shard0.safetensors +3 -0
  34. model_pp0_ep29_shard0.safetensors +3 -0
  35. model_pp0_ep2_shard0.safetensors +3 -0
  36. model_pp0_ep30_shard0.safetensors +3 -0
  37. model_pp0_ep31_shard0.safetensors +3 -0
  38. model_pp0_ep3_shard0.safetensors +3 -0
  39. model_pp0_ep4_shard0.safetensors +3 -0
  40. model_pp0_ep5_shard0.safetensors +3 -0
  41. model_pp0_ep6_shard0.safetensors +3 -0
  42. model_pp0_ep7_shard0.safetensors +3 -0
  43. model_pp0_ep8_shard0.safetensors +3 -0
  44. model_pp0_ep9_shard0.safetensors +3 -0
  45. modeling_mimo_v2.py +697 -0
  46. special_tokens_map.json +31 -0
  47. tokenizer.json +3 -0
  48. tokenizer_config.json +240 -0
  49. vocab.json +0 -0
.gitattributes ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/architecture.png filter=lfs diff=lfs merge=lfs -text
37
+ assets/benchmark.jpg filter=lfs diff=lfs merge=lfs -text
38
+ assets/post_training_evaluation.jpg filter=lfs diff=lfs merge=lfs -text
39
+ model.safetensors.index.json filter=lfs diff=lfs merge=lfs -text
40
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ - zh
6
+ tags:
7
+ - text-generation
8
+ - agent
9
+ - long-context
10
+ - code
11
+ ---
12
+
13
+ <br/><br/>
14
+
15
+ <div align="center">
16
+ <picture>
17
+ <source srcset="https://github.com/XiaomiMiMo/MiMo/raw/main/figures/Xiaomi_MiMo_darkmode.png?raw=true" media="(prefers-color-scheme: dark)">
18
+ <img src="https://github.com/XiaomiMiMo/MiMo/raw/main/figures/Xiaomi_MiMo.png?raw=true" width="60%" alt="Xiaomi-MiMo" />
19
+ </picture>
20
+ </div>
21
+
22
+ <br/>
23
+
24
+ <div align="center" style="line-height: 1;">
25
+ |
26
+ <a href="https://huggingface.co/XiaomiMiMo" target="_blank">🤗 HuggingFace</a>
27
+ &nbsp;|
28
+ <a href="https://mimo.xiaomi.com/mimo-v2-5-pro" target="_blank">📰 Blog </a>
29
+ &nbsp;|
30
+ <a href="https://platform.xiaomimimo.com/" target="_blank">🎨 Xiaomi MiMo API Platform </a>
31
+ &nbsp;|
32
+ <a href="https://aistudio.xiaomimimo.com" target="_blank">🗨️ Xiaomi MiMo Studio </a>
33
+ &nbsp;|
34
+ </div>
35
+
36
+ <br/>
37
+
38
+ <div align="center" style="line-height: 1.2;">
39
+ <strong>Community</strong><br/>
40
+ <a href="https://work.weixin.qq.com/apph5/external_room/join/group_mng?plg_id=c417f99bd9014b5dd894daa8bfe19790&" target="_blank">WeChat Group</a>
41
+ &nbsp;|&nbsp;
42
+ <a href="https://discord.gg/WX2R2uNp" target="_blank">Discord</a>
43
+ &nbsp;|&nbsp;
44
+ <a href="https://t.me/+3T-I0pekOVIyNDBl" target="_blank">Telegram</a>
45
+ &nbsp;|&nbsp;
46
+ <a href="https://www.reddit.com/r/XiaomiMiMo_Official/" target="_blank">Reddit</a>
47
+ </div>
48
+
49
+ <br/>
50
+
51
+ # MiMo-V2.5-Pro
52
+
53
+ MiMo-V2.5-Pro is an open-source Mixture-of-Experts (MoE) language model with 1.02T total parameters and 42B active parameters. It utilizes the hybrid attention architecture and 3-layer Multi-Token Prediction (MTP) introduced in [MiMo-V2-Flash](https://github.com/XiaomiMiMo/MiMo-V2-Flash), with a context length of up to 1M tokens.
54
+
55
+ <div align="center">
56
+ <img src="assets/benchmark.jpg" width="90%" alt="Benchmark Results" />
57
+ </div>
58
+
59
+ ## 1. Introduction
60
+
61
+ MiMo-V2.5-Pro is our most capable model to date, designed for the most demanding agentic, complex software engineering, and long-horizon tasks. It sustains complex trajectories spanning thousands of tool calls with strong instruction following and coherence over a 1M-token context window. Key features include:
62
+
63
+ - **Hybrid Attention Architecture**: Interleaves Sliding Window Attention (SWA) and Global Attention (GA) with a 6:1 ratio and 128 sliding window. This reduces KV-cache storage by nearly 7x while maintaining long-context performance via learnable attention sink bias.
64
+ - **Multi-Token Prediction (MTP)**: Equipped with three lightweight MTP modules using dense FFNs. This triples output speed during inference and can also accelerate rollout in RL training.
65
+ - **Efficient Pre-Training**: Trained on 27T tokens using FP8 mixed precision and native 32k seq length. The context window supports up to 1M tokens.
66
+ - **Agentic Capabilities**: Post-training utilizes SFT, large-scale agentic RL and Multi-Teacher On-Policy Distillation (MOPD), achieving superior performance on the most demanding agentic, complex software engineering, and long-horizon tasks.
67
+
68
+ ## 2. Model Downloads
69
+
70
+ | Model | Total Params | Active Params | Context Length | Precision | Download |
71
+ | :--- | :---: | :---: | :---: | :---: | :---: |
72
+ | **MiMo-V2.5-Pro** | 1.02T | 42B | 1M | FP8 (E4M3) Mixed | [🤗 HuggingFace](https://huggingface.co/XiaomiMiMo/MiMo-V2.5-Pro) <br> [🤖 ModelScope](https://modelscope.cn/models/XiaomiMiMo/MiMo-V2.5-Pro) |
73
+ | **MiMo-V2.5-Pro-Base** | 1.02T | 42B | 256K | FP8 (E4M3) Mixed | [🤗 HuggingFace](https://huggingface.co/XiaomiMiMo/MiMo-V2.5-Pro-Base) <br> [🤖 ModelScope](https://modelscope.cn/models/XiaomiMiMo/MiMo-V2.5-Pro-Base) |
74
+
75
+ ## 3. Evaluation Results
76
+
77
+ ### Base Model Evaluation
78
+
79
+ | Category | Benchmark | Setting | MiMo-V2.5-Pro Base | MiMo-V2.5 Base | DeepSeek-V4-Pro Base | DeepSeek-V4-Flash Base | Kimi-K2 Base |
80
+ | :--- | :--- | :---: | :---: | :---: | :---: | :---: | :---: |
81
+ | **Params** | #Activated / #Total | - | 42B / 1.02T | 15B / 310B | 49B / 1.6T | 13B / 284B | 32B / 1.04T |
82
+ | **General** | BBH | 3-shot | 88.4 | 87.2 | 87.5 | 86.9 | 88.7 |
83
+ | | MMLU | 5-shot | 89.4 | 86.3 | 90.1 | 88.7 | 87.8 |
84
+ | | MMLU-Redux | 5-shot | 92.8 | 89.8 | 90.8 | 89.4 | 90.2 |
85
+ | | MMLU-Pro | 5-shot | 68.5 | 65.8 | 73.5 | 68.3 | 69.2 |
86
+ | | DROP | 3-shot | 86.3 | 83.7 | 88.7 | 88.6 | 83.6 |
87
+ | | ARC-Challenge | 25-shot | 97.2 | 96.5 | - | - | 96.2 |
88
+ | | HellaSwag | 10-shot | 89.8 | 88.6 | 88.0 | 85.7 | 94.6 |
89
+ | | WinoGrande | 5-shot | 85.6 | 84.7 | 81.5 | 79.5 | 85.3 |
90
+ | | TriviaQA | 5-shot | 81.3 | 80.7 | 85.6 | 82.8 | 85.1 |
91
+ | | GPQA-Diamond | 5-shot | 66.7 | 58.1 | - | - | 48.1 |
92
+ | **Math** | GSM8K | 8-shot | 99.6 | 83.3 | 92.6 | 90.8 | 92.1 |
93
+ | | MATH | 4-shot | 86.2 | 67.7 | 64.5 | 57.4 | 70.2 |
94
+ | | AIME 24&25 | 2-shot | 37.3 | 36.9 | - | - | 31.6 |
95
+ | **Code** | HumanEval+ | 1-shot | 75.6 | 71.3 | - | - | 84.8 |
96
+ | | MBPP+ | 3-shot | 74.1 | 70.9 | - | - | 73.8 |
97
+ | | LiveCodeBench v6 | 1-shot | 39.6 | 35.5 | - | - | 26.3 |
98
+ | | SWE-Bench (AgentLess) | 3-shot | 35.7 | 30.8 | - | - | 28.2 |
99
+ | **Chinese** | C-Eval | 5-shot | 91.5 | 88.6 | 93.1 | 92.1 | 92.5 |
100
+ | | CMMLU | 5-shot | 90.2 | 88.2 | 90.8 | 90.4 | 90.9 |
101
+ | **Multilingual** | GlobalMMLU | 5-shot | 83.6 | 77.4 | - | - | 80.7 |
102
+
103
+ ### Long-context Evaluation
104
+
105
+ <div align="center">
106
+ <img src="assets/post_training_evaluation.jpg" width="80%" alt="Post-training Evaluation" />
107
+ </div>
108
+
109
+ GraphWalks is a long-context benchmark from OpenAI that fills the prompt with a directed graph of hex-hash nodes and asks the model to run a breadth-first search (nodes exactly at depth *N*) or list a node's parents. We evaluate across the full 32k–1M input-token span and apply the same evaluation fixes described by Anthropic.
110
+
111
+ MiMo V2.5 Pro delivers a major leap in long-context reasoning. Past 128k, V2 Pro degrades rapidly and collapses to 0.00 at 1M on both subtasks, while V2.5 Pro still scores 0.56 BFS / 0.92 Parents at 512k and 0.37 / 0.62 at 1M.
112
+
113
+ ## 4. Model Architecture & Training Process
114
+
115
+ MiMo-V2.5-Pro addresses the quadratic complexity of long contexts by interleaving Local Sliding Window Attention (SWA) and Global Attention (GA). Unlike traditional speculative decoding, our MTP module is natively integrated for training and inference.
116
+
117
+ <div align="center">
118
+ <img src="assets/architecture.png" width="60%" alt="Model Architecture" />
119
+ </div>
120
+
121
+ ### Model Summary
122
+
123
+ | Component | MiMo-V2.5-Pro | MiMo-V2.5 |
124
+ | :--- | :---: | :---: |
125
+ | **Total Parameters** | 1.02T | 310B |
126
+ | **Activated Parameters** | 42B | 15B |
127
+ | **Hidden Size** | 6144 | 4096 |
128
+ | **Num Layers** | 70 (1 dense + 69 MoE) | 48 (1 dense + 47 MoE)|
129
+ | **Full Attention Layers** | 10 | 9 |
130
+ | **SWA Layers** | 60 | 39 |
131
+ | **Num Attention Heads** | 128 | 64 |
132
+ | **Num KV Heads** | 8 (GQA) | 8 (GA) / 4 (SWA) |
133
+ | **Head Dim (QK / V)** | 192 / 128 | 192 / 128 |
134
+ | **Routed Experts** | 384 | 256 |
135
+ | **Experts per Token** | 8 | 8 |
136
+ | **MoE Intermediate Size** | 2048 | 2048 |
137
+ | **Dense Intermediate Size** | 16384 (layer 0 only) | 16384 (layer 0 only) |
138
+ | **SWA Window Size** | 128 | 128 |
139
+ | **Max Context Length** | 1M | 1M |
140
+ | **MTP Layers** | 3 | 3 |
141
+
142
+ ### Training Process
143
+
144
+ For post-training, MiMo-V2.5-Pro adopts the three-stage post-training paradigm introduced in [MiMo-V2-Flash](https://github.com/XiaomiMiMo/MiMo-V2-Flash) to achieve exceptional performance. The paradigm begins with Supervised Fine-Tuning (SFT) to build strong, foundational instruction-following skills using curated data pairs. Next, in the Domain-Specialized Training stage, diverse teacher models — ranging from math and safety to complex agentic tool-use — are individually optimized using domain-specific RL rewards. Finally, the process culminates in Multi-Teacher On-Policy Distillation (MOPD). Through dynamic on-policy RL, the single student model iteratively learns from its own outputs, continuously receiving precise token-level guidance from the expert teachers to seamlessly integrate broad capabilities.
145
+
146
+ ## 5. Deployment
147
+
148
+ Since inference engines are continuously updated and optimized, this guide only provides deployment examples for reference. For the best results, we strongly recommend following the referenced guides below for the latest best practices.
149
+
150
+ ### SGLang Deployment
151
+
152
+ For the best performance, we strongly recommend deploying using this approach, which is officially supported by the SGLang community. Please refer to [SGLang MiMo-V2.5-Pro Cookbook](https://docs.sglang.io/cookbook/autoregressive/Xiaomi/MiMo-V2.5) for the latest deployment guide.
153
+
154
+ The following is an example of running the model with SGLang, referenced from [sgl-project/sglang#23808](https://github.com/sgl-project/sglang/pull/23808):
155
+
156
+ ```bash
157
+ SGLANG_ENABLE_SPEC_V2=1
158
+ SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
159
+ python3 -m sglang.launch_server \
160
+ --model-path XiaomiMiMo/MiMo-V2.5-Pro \
161
+ --trust-remote-code \
162
+ --pp-size 1 \
163
+ --dp-size 2 \
164
+ --ep-size 16 \
165
+ --tp-size 16 \
166
+ --moe-dense-tp-size 1 \
167
+ --enable-dp-attention \
168
+ --moe-a2a-backend deepep \
169
+ --dist-init-addr ${LWS_LEADER_IP}:20000 \
170
+ --node-rank ${LWS_WORKER_INDEX} \
171
+ --nnodes ${LWS_GROUP_SIZE} \
172
+ --page-size 64 \
173
+ --attention-backend fa3 \
174
+ --quantization fp8 \
175
+ --mem-fraction-static 0.7 \
176
+ --max-running-requests 128 \
177
+ --cuda-graph-max-bs 64 \
178
+ --chunked-prefill-size 32768 \
179
+ --context-length 1048576 \
180
+ --tokenizer-worker-num 64 \
181
+ --speculative-algorithm EAGLE \
182
+ --speculative-num-steps 3 \
183
+ --speculative-eagle-topk 1 \
184
+ --speculative-num-draft-tokens 4 \
185
+ --enable-multi-layer-eagle \
186
+ --host 0.0.0.0 \
187
+ --port 9001 \
188
+ --reasoning-parser mimo \
189
+ --tool-call-parser mimo \
190
+ --watchdog-timeout 3600 \
191
+ --model-loader-extra-config '{"enable_multithread_load": "true","num_threads": 64}'
192
+ ```
193
+
194
+ ### vLLM Deployment
195
+
196
+ For the best performance, we strongly recommend deploying using this approach, which is officially supported by the vLLM community. Please refer to [vLLM MiMo-V2.5-Pro Cookbook](https://recipes.vllm.ai/XiaomiMiMo/MiMo-V2.5-Pro) for the latest deployment guide.
197
+
198
+ For local deployment, we recommend setting the sampling parameters to `temperature=1.0`, `top_p=0.95`.
199
+
200
+ ## Citation
201
+
202
+ ```bibtex
203
+ @misc{mimo2026v25pro,
204
+ title={MiMo-V2.5-Pro},
205
+ author={{Xiaomi MiMo Team}},
206
+ year={2026},
207
+ howpublished={\url{https://huggingface.co/collections/XiaomiMiMo/mimo-v25}},
208
+ }
209
+ ```
210
+
211
+ ## Contact
212
+
213
+ For questions or feedback, reach us at [mimo@xiaomi.com](mailto:mimo@xiaomi.com) or join our community:
214
+
215
+ - [WeChat Group](https://work.weixin.qq.com/apph5/external_room/join/group_mng?plg_id=c417f99bd9014b5dd894daa8bfe19790&)
216
+ - [Discord](https://discord.gg/WX2R2uNp)
217
+ - [Telegram](https://t.me/+3T-I0pekOVIyNDBl)
218
+ - [Reddit](https://www.reddit.com/r/XiaomiMiMo_Official/)
added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
assets/architecture.png ADDED

Git LFS Details

  • SHA256: 364418932cbfeb4757e7f7bc91bd3df19de6fca21640c3c0bbda89a569358e0d
  • Pointer size: 131 Bytes
  • Size of remote file: 152 kB
assets/benchmark.jpg ADDED

Git LFS Details

  • SHA256: 5b4e4a2c797c9ab879249f88b5b2341322391f71c689941a70e075b66401a6c6
  • Pointer size: 131 Bytes
  • Size of remote file: 624 kB
assets/post_training_evaluation.jpg ADDED

Git LFS Details

  • SHA256: f8b9ea21cb0b35905395f4af80aa6952fce1367b388af82d67ab89233e2674a4
  • Pointer size: 131 Bytes
  • Size of remote file: 216 kB
config.json ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MiMoV2ForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_mimo_v2.MiMoV2Config",
7
+ "AutoModel": "modeling_mimo_v2.MiMoV2Model",
8
+ "AutoModelForCausalLM": "modeling_mimo_v2.MiMoV2ForCausalLM"
9
+ },
10
+ "quantization_config": {
11
+ "activation_scheme": "dynamic",
12
+ "fmt": "e4m3",
13
+ "quant_method": "fp8",
14
+ "weight_block_size": [
15
+ 128,
16
+ 128
17
+ ],
18
+ "ignored_layers": [
19
+ "model.layers.0.self_attn.o_proj",
20
+ "model.layers.1.self_attn.o_proj",
21
+ "model.layers.2.self_attn.o_proj",
22
+ "model.layers.3.self_attn.o_proj",
23
+ "model.layers.4.self_attn.o_proj",
24
+ "model.layers.5.self_attn.o_proj",
25
+ "model.layers.6.self_attn.o_proj",
26
+ "model.layers.7.self_attn.o_proj",
27
+ "model.layers.8.self_attn.o_proj",
28
+ "model.layers.9.self_attn.o_proj",
29
+ "model.layers.10.self_attn.o_proj",
30
+ "model.layers.11.self_attn.o_proj",
31
+ "model.layers.12.self_attn.o_proj",
32
+ "model.layers.13.self_attn.o_proj",
33
+ "model.layers.14.self_attn.o_proj",
34
+ "model.layers.15.self_attn.o_proj",
35
+ "model.layers.16.self_attn.o_proj",
36
+ "model.layers.17.self_attn.o_proj",
37
+ "model.layers.18.self_attn.o_proj",
38
+ "model.layers.19.self_attn.o_proj",
39
+ "model.layers.20.self_attn.o_proj",
40
+ "model.layers.21.self_attn.o_proj",
41
+ "model.layers.22.self_attn.o_proj",
42
+ "model.layers.23.self_attn.o_proj",
43
+ "model.layers.24.self_attn.o_proj",
44
+ "model.layers.25.self_attn.o_proj",
45
+ "model.layers.26.self_attn.o_proj",
46
+ "model.layers.27.self_attn.o_proj",
47
+ "model.layers.28.self_attn.o_proj",
48
+ "model.layers.29.self_attn.o_proj",
49
+ "model.layers.30.self_attn.o_proj",
50
+ "model.layers.31.self_attn.o_proj",
51
+ "model.layers.32.self_attn.o_proj",
52
+ "model.layers.33.self_attn.o_proj",
53
+ "model.layers.34.self_attn.o_proj",
54
+ "model.layers.35.self_attn.o_proj",
55
+ "model.layers.36.self_attn.o_proj",
56
+ "model.layers.37.self_attn.o_proj",
57
+ "model.layers.38.self_attn.o_proj",
58
+ "model.layers.39.self_attn.o_proj",
59
+ "model.layers.40.self_attn.o_proj",
60
+ "model.layers.41.self_attn.o_proj",
61
+ "model.layers.42.self_attn.o_proj",
62
+ "model.layers.43.self_attn.o_proj",
63
+ "model.layers.44.self_attn.o_proj",
64
+ "model.layers.45.self_attn.o_proj",
65
+ "model.layers.46.self_attn.o_proj",
66
+ "model.layers.47.self_attn.o_proj",
67
+ "model.layers.48.self_attn.o_proj",
68
+ "model.layers.49.self_attn.o_proj",
69
+ "model.layers.50.self_attn.o_proj",
70
+ "model.layers.51.self_attn.o_proj",
71
+ "model.layers.52.self_attn.o_proj",
72
+ "model.layers.53.self_attn.o_proj",
73
+ "model.layers.54.self_attn.o_proj",
74
+ "model.layers.55.self_attn.o_proj",
75
+ "model.layers.56.self_attn.o_proj",
76
+ "model.layers.57.self_attn.o_proj",
77
+ "model.layers.58.self_attn.o_proj",
78
+ "model.layers.59.self_attn.o_proj",
79
+ "model.layers.60.self_attn.o_proj",
80
+ "model.layers.61.self_attn.o_proj",
81
+ "model.layers.62.self_attn.o_proj",
82
+ "model.layers.63.self_attn.o_proj",
83
+ "model.layers.64.self_attn.o_proj",
84
+ "model.layers.65.self_attn.o_proj",
85
+ "model.layers.66.self_attn.o_proj",
86
+ "model.layers.67.self_attn.o_proj",
87
+ "model.layers.68.self_attn.o_proj",
88
+ "model.layers.69.self_attn.o_proj",
89
+ "model.decoder.self_attn.o_proj"
90
+ ]
91
+ },
92
+ "add_full_attention_sink_bias": false,
93
+ "add_swa_attention_sink_bias": true,
94
+ "attention_bias": false,
95
+ "attention_chunk_size": 128,
96
+ "attention_dropout": 0.0,
97
+ "attention_projection_layout": "fused_qkv",
98
+ "attention_value_scale": 0.612,
99
+ "head_dim": 192,
100
+ "hidden_act": "silu",
101
+ "hidden_size": 6144,
102
+ "hybrid_layer_pattern": [
103
+ 0, 1, 1, 1, 1, 1, 1,
104
+ 0, 1, 1, 1, 1, 1, 1, 1,
105
+ 0, 1, 1, 1, 1, 1, 1, 1,
106
+ 0, 1, 1, 1, 1, 1, 1, 1,
107
+ 0, 1, 1, 1, 1, 1, 1, 1,
108
+ 0, 1, 1, 1, 1, 1, 1, 1,
109
+ 0, 1, 1, 1, 1, 1, 1, 1,
110
+ 0, 1, 1, 1, 1, 1, 1,
111
+ 0, 1, 1, 1, 1, 1, 1,
112
+ 0
113
+ ],
114
+ "initializer_range": 0.02,
115
+ "intermediate_size": 16384,
116
+ "layernorm_epsilon": 1e-05,
117
+ "max_position_embeddings": 1048576,
118
+ "model_type": "mimo_v2",
119
+ "moe_intermediate_size": 2048,
120
+ "moe_layer_freq": [
121
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
122
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
123
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
124
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
125
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
126
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
127
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
128
+ ],
129
+ "n_group": 1,
130
+ "n_routed_experts": 384,
131
+ "n_shared_experts": null,
132
+ "norm_topk_prob": true,
133
+ "num_attention_heads": 128,
134
+ "num_experts_per_tok": 8,
135
+ "num_hidden_layers": 70,
136
+ "num_key_value_heads": 8,
137
+ "partial_rotary_factor": 0.334,
138
+ "rope_theta": 10000000,
139
+ "routed_scaling_factor": null,
140
+ "scoring_func": "sigmoid",
141
+ "sliding_window": 128,
142
+ "sliding_window_size": 128,
143
+ "swa_head_dim": 192,
144
+ "swa_num_attention_heads": 128,
145
+ "swa_num_key_value_heads": 8,
146
+ "swa_rope_theta": 10000,
147
+ "swa_v_head_dim": 128,
148
+ "tie_word_embeddings": false,
149
+ "topk_group": 1,
150
+ "topk_method": "noaux_tc",
151
+ "torch_dtype": "bfloat16",
152
+ "transformers_version": "4.57.1",
153
+ "use_cache": true,
154
+ "v_head_dim": 128,
155
+ "vocab_size": 152576
156
+ }
configuration_mimo_v2.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ #
3
+ # Copyright 2026 Xiaomi Corporation.
4
+ # Copyright 2026 The HuggingFace Inc. team.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ from transformers.configuration_utils import PretrainedConfig
19
+ from transformers.modeling_rope_utils import rope_config_validation
20
+ from transformers.utils import logging
21
+
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
+
26
+ _MIMOV2_ATTENTION_PROJECTION_LAYOUTS = {"split", "fused_qkv"}
27
+
28
+ _MIMOV2_SPLIT_TP_PLAN = {
29
+ "layers.*.self_attn.q_proj": "colwise",
30
+ "layers.*.self_attn.k_proj": "colwise",
31
+ "layers.*.self_attn.v_proj": "colwise",
32
+ "layers.*.self_attn.o_proj": "rowwise",
33
+ "layers.*.mlp.gate_proj": "colwise",
34
+ "layers.*.mlp.up_proj": "colwise",
35
+ "layers.*.mlp.down_proj": "rowwise",
36
+ }
37
+
38
+ _MIMOV2_FUSED_QKV_TP_PLAN = {
39
+ "layers.*.self_attn.qkv_proj": "colwise",
40
+ "layers.*.self_attn.o_proj": "rowwise",
41
+ "layers.*.mlp.gate_proj": "colwise",
42
+ "layers.*.mlp.up_proj": "colwise",
43
+ "layers.*.mlp.down_proj": "rowwise",
44
+ }
45
+
46
+ _MIMOV2_PP_PLAN = {
47
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
48
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
49
+ "norm": (["hidden_states"], ["hidden_states"]),
50
+ }
51
+
52
+
53
+ class MiMoV2Config(PretrainedConfig):
54
+
55
+ model_type = "mimo_v2"
56
+ keys_to_ignore_at_inference = ["past_key_values"]
57
+
58
+ base_model_tp_plan = _MIMOV2_SPLIT_TP_PLAN
59
+ base_model_pp_plan = _MIMOV2_PP_PLAN
60
+
61
+ attribute_map = {
62
+ "num_local_experts": "n_routed_experts",
63
+ }
64
+
65
+ def __init__(
66
+ self,
67
+ vocab_size=151936,
68
+ hidden_size=4096,
69
+ intermediate_size=22016,
70
+ num_hidden_layers=32,
71
+ num_attention_heads=32,
72
+ num_key_value_heads=32,
73
+ hidden_act="silu",
74
+ max_position_embeddings=32768,
75
+ initializer_range=0.02,
76
+ layernorm_epsilon=1e-6,
77
+ use_cache=True,
78
+ tie_word_embeddings=False,
79
+ rope_theta=10000.0,
80
+ rope_scaling=None,
81
+ attention_dropout=0.0,
82
+ attention_bias=False,
83
+ attention_value_scale=None,
84
+ head_dim=None,
85
+ v_head_dim=None,
86
+ swa_num_attention_heads=None,
87
+ swa_num_key_value_heads=None,
88
+ swa_head_dim=None,
89
+ swa_v_head_dim=None,
90
+ swa_rope_theta=None,
91
+ sliding_window=None,
92
+ sliding_window_size=None,
93
+ add_full_attention_sink_bias=False,
94
+ add_swa_attention_sink_bias=False,
95
+ hybrid_block_size=None,
96
+ hybrid_layer_pattern=None,
97
+ partial_rotary_factor=1.0,
98
+ n_routed_experts=None,
99
+ moe_intermediate_size=None,
100
+ num_experts_per_tok=None,
101
+ routed_scaling_factor=None,
102
+ scoring_func="sigmoid",
103
+ topk_method="noaux_tc",
104
+ n_group=None,
105
+ topk_group=None,
106
+ norm_topk_prob=True,
107
+ moe_layer_freq=None,
108
+ attention_projection_layout="split",
109
+ **kwargs,
110
+ ):
111
+ rope_parameters = kwargs.pop("rope_parameters", None)
112
+ if rope_scaling is None and rope_parameters is not None:
113
+ rope_scaling = rope_parameters
114
+
115
+ if attention_projection_layout is None:
116
+ attention_projection_layout = "split"
117
+ if attention_projection_layout not in _MIMOV2_ATTENTION_PROJECTION_LAYOUTS:
118
+ raise ValueError(f"Unsupported MiMoV2 attention projection layout: {attention_projection_layout}")
119
+
120
+ self.attention_projection_layout = attention_projection_layout
121
+ self.base_model_tp_plan = (
122
+ _MIMOV2_FUSED_QKV_TP_PLAN.copy()
123
+ if attention_projection_layout == "fused_qkv"
124
+ else _MIMOV2_SPLIT_TP_PLAN.copy()
125
+ )
126
+ self.base_model_pp_plan = _MIMOV2_PP_PLAN.copy()
127
+
128
+ self.vocab_size = vocab_size
129
+ self.max_position_embeddings = max_position_embeddings
130
+ self.hidden_size = hidden_size
131
+ self.intermediate_size = intermediate_size
132
+ self.num_hidden_layers = num_hidden_layers
133
+ self.num_attention_heads = num_attention_heads
134
+
135
+ if num_key_value_heads is None:
136
+ num_key_value_heads = num_attention_heads
137
+ if num_attention_heads % num_key_value_heads != 0:
138
+ raise ValueError("num_attention_heads must be divisible by num_key_value_heads")
139
+
140
+ self.num_key_value_heads = num_key_value_heads
141
+ self.hidden_act = hidden_act
142
+ self.initializer_range = initializer_range
143
+ self.layernorm_epsilon = layernorm_epsilon
144
+ self.use_cache = use_cache
145
+ self.rope_theta = rope_theta
146
+ self.rope_scaling = rope_scaling
147
+ self.attention_dropout = attention_dropout
148
+ self.attention_bias = attention_bias
149
+ self.attention_value_scale = attention_value_scale
150
+
151
+ self.head_dim = head_dim if head_dim is not None else hidden_size // num_attention_heads
152
+ self.v_head_dim = v_head_dim if v_head_dim is not None else self.head_dim
153
+ self.swa_num_attention_heads = (
154
+ swa_num_attention_heads if swa_num_attention_heads is not None else num_attention_heads
155
+ )
156
+ self.swa_num_key_value_heads = (
157
+ swa_num_key_value_heads if swa_num_key_value_heads is not None else num_key_value_heads
158
+ )
159
+ if self.swa_num_attention_heads % self.swa_num_key_value_heads != 0:
160
+ raise ValueError("swa_num_attention_heads must be divisible by swa_num_key_value_heads")
161
+ self.swa_head_dim = swa_head_dim if swa_head_dim is not None else self.head_dim
162
+ self.swa_v_head_dim = swa_v_head_dim if swa_v_head_dim is not None else self.swa_head_dim
163
+ self.swa_rope_theta = swa_rope_theta if swa_rope_theta is not None else rope_theta
164
+
165
+ if sliding_window is None:
166
+ sliding_window = sliding_window_size
167
+ self.sliding_window = sliding_window
168
+ self.sliding_window_size = sliding_window_size if sliding_window_size is not None else sliding_window
169
+ self.add_full_attention_sink_bias = add_full_attention_sink_bias
170
+ self.add_swa_attention_sink_bias = add_swa_attention_sink_bias
171
+
172
+ if hybrid_block_size is not None and hybrid_layer_pattern is None:
173
+ hybrid_layer_pattern = [0 if ((i + 1) % hybrid_block_size == 0) else 1 for i in range(num_hidden_layers)]
174
+ elif hybrid_layer_pattern is None:
175
+ hybrid_layer_pattern = [0] * num_hidden_layers
176
+ if len(hybrid_layer_pattern) != num_hidden_layers:
177
+ raise ValueError("hybrid_layer_pattern length must match num_hidden_layers")
178
+ self.hybrid_block_size = hybrid_block_size
179
+ self.hybrid_layer_pattern = hybrid_layer_pattern
180
+
181
+ self.partial_rotary_factor = partial_rotary_factor
182
+
183
+ self.n_routed_experts = n_routed_experts
184
+ self.moe_intermediate_size = moe_intermediate_size if moe_intermediate_size is not None else intermediate_size
185
+ self.num_experts_per_tok = num_experts_per_tok
186
+ self.routed_scaling_factor = routed_scaling_factor
187
+ self.scoring_func = scoring_func
188
+ self.topk_method = topk_method
189
+ self.n_group = n_group
190
+ self.topk_group = topk_group
191
+ self.norm_topk_prob = norm_topk_prob
192
+ if isinstance(moe_layer_freq, int):
193
+ moe_layer_freq = [moe_layer_freq > 0 and i % moe_layer_freq == 0 for i in range(num_hidden_layers)]
194
+ elif moe_layer_freq is None:
195
+ moe_layer_freq = [False] * num_hidden_layers
196
+ if len(moe_layer_freq) != num_hidden_layers:
197
+ raise ValueError("moe_layer_freq length must match num_hidden_layers")
198
+ self.moe_layer_freq = moe_layer_freq
199
+
200
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
201
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
202
+ rope_config_validation(self)
203
+
204
+ super().__init__(
205
+ tie_word_embeddings=tie_word_embeddings,
206
+ **kwargs,
207
+ )
208
+
209
+ __all__ = ["MiMoV2Config"]
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors.index.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7ad2187407f8cf989f140a42d9bf340be7a220b2f6b35afa0a728774eb20f4b
3
+ size 15569001
model_mtp.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0f1599b41996feab60b2b2ea42338b26979a2b58b115526bb49371a91e3c8d8
3
+ size 2463641280
model_pp0_ep0_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3aac22ecd430136c92626bbf1345dc9e99d9800a4cd49878793b52a6154cd40
3
+ size 34554911640
model_pp0_ep0_shard1.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6422d714cb680c72ebc5982b95252f5457e7f4625ceb29fad5e05aecc10c3b57
3
+ size 27180576088
model_pp0_ep10_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf18b588ea10bead2879179f15bbca4e91e9abd59863581e2cd9004470d2d54d
3
+ size 31264218696
model_pp0_ep11_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a0b5d0ef3523bd3ce8fe2eab1bb226ce5122bf72fb30cf7bbe6e8f99459bb90
3
+ size 31264218696
model_pp0_ep12_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8f63229280884fc677945fee04391246c3d4d7aacf90f327ef430ab62bc72c6
3
+ size 31264218696
model_pp0_ep13_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93704b69e4f0e0fa148e2118f605518fddb62dbda17d4575918aa5f075dfb890
3
+ size 31264218696
model_pp0_ep14_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ebf87de52e82194428200f82e28c666bf42639c43d32b12218041ec67e4dccb
3
+ size 31264218696
model_pp0_ep15_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffbe58cf0bf67a61481ea7d5bb6f6dd310c546c5bed21ff45db264fbb5ffca46
3
+ size 31264218696
model_pp0_ep16_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d5028da852f2ae339e047deb0f4dbdcb5770ffbc411bc130947b40b6cc7c0f4
3
+ size 31264218696
model_pp0_ep17_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3ac44fff1a24c635d28ce4addc84589cb6638d6284f2e17a502b4b23dc0d9fa
3
+ size 31264218696
model_pp0_ep18_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:701682b0e073f471a47d3a61b1adb13d7e1183eacd053be2aafd6e33c2f1efb9
3
+ size 31264218696
model_pp0_ep19_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:653d5ed28d8320369708a145b209d0fdb6cb03673351a1cc92eff163dfe6757e
3
+ size 31264218696
model_pp0_ep1_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ae084dd12811777d1da10784235d5014a21a6b22fe936e70d0ab4682c336d7
3
+ size 31264213728
model_pp0_ep20_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ae1f90c36626cb3944ed4986f7df3181b761be5658e9cd561691dab946b1f28
3
+ size 31264218696
model_pp0_ep21_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25954cf458b4e149523d7f436ef404d51c6cc7d4fa6d2a12bfaa9e4fb7fb147c
3
+ size 31264218696
model_pp0_ep22_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb181bc5d65f77090b8423503c31895ed122d041bb6cf6d4a7b7846744d6be05
3
+ size 31264218696
model_pp0_ep23_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f67f2343e3b92066428bea0fb39eccc68d3f587ff1ad274478e3d369120fb61b
3
+ size 31264218696
model_pp0_ep24_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:588c6199211b4b947b53ce0fa93df4597a2da0090a4ca637e69e70428ef1780c
3
+ size 31264218696
model_pp0_ep25_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c845e96be226275597955cfe995ee41fa6b7eafd3a2e579595efadaca961c8f0
3
+ size 31264218696
model_pp0_ep26_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30f987f808058ba27447d8200e5c0d41df64a2d9525159f8ab9c5f635d69509a
3
+ size 31264218696
model_pp0_ep27_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b151aeb375b84ba967792207e8bd4e0c7ede979534f8648d3f5fa2bb88659ac
3
+ size 31264218696
model_pp0_ep28_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2538741e85b3430549f389be672d8cda5384f88f0f92af9d83442c37936ddd14
3
+ size 31264218696
model_pp0_ep29_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf56b7502b8aabd8d0779978e9ad03f4905715c2654cd9c1218571ea76f28a25
3
+ size 31264218696
model_pp0_ep2_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4df6ddf8e6c3ced5d78f11a47c0f425b898cdb92acb31675bbb93de6aeeb52f6
3
+ size 31264213728
model_pp0_ep30_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e3aa954fa6729a12a773800fdfbd5a71dae18e30e4a4b7177d430173d541152
3
+ size 31264218696
model_pp0_ep31_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc4350c6f373209df4f49755ae06ede1a1c1ebed2f1fc98778e71c2b180de480
3
+ size 31264218696
model_pp0_ep3_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3696b210440cfe93b533e0945263203d917e01a9b8e8921845b5e262a69875de
3
+ size 31264213728
model_pp0_ep4_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b042d23818e9d537e48b63cfa437ddc2be152f2a588fc5bd678acf1c2f67bc51
3
+ size 31264213728
model_pp0_ep5_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd6e5884dae5be62fdcd1bf843281c7f7dbd671f461e57d0764cf1803afd5aa4
3
+ size 31264213728
model_pp0_ep6_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbde11cae32b38026481129d053b976842958da86f40abf91fbcd91387b9ae67
3
+ size 31264213728
model_pp0_ep7_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ff4bc2f37ab4496349874eff47ed0b253aab854983c597b040d6912bfa730eb
3
+ size 31264213728
model_pp0_ep8_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bf1dc12b959388fb6355718d9217457f8564a35145c52200e778952542479e1
3
+ size 31264217040
model_pp0_ep9_shard0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1d7ce5137511e43470781806571b3575246c55e69dd22d05acb85a603133b1a
3
+ size 31264218696
modeling_mimo_v2.py ADDED
@@ -0,0 +1,697 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ #
3
+ # Copyright 2026 Xiaomi Corporation.
4
+ # Copyright 2026 The HuggingFace Inc. team.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ from copy import copy
19
+ from typing import Callable, Optional, Union
20
+
21
+ import torch
22
+ import torch.nn as nn
23
+ import torch.nn.functional as F
24
+
25
+ from transformers.activations import ACT2FN
26
+ from transformers.cache_utils import Cache, DynamicCache
27
+ from transformers.generation import GenerationMixin
28
+ from transformers.integrations import use_kernel_forward_from_hub
29
+ from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
30
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
31
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
32
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
33
+ from transformers.processing_utils import Unpack
34
+ from transformers.utils import TransformersKwargs, can_return_tuple, logging
35
+
36
+ from .configuration_mimo_v2 import MiMoV2Config
37
+
38
+
39
+ logger = logging.get_logger(__name__)
40
+
41
+
42
+ def rotate_half(x):
43
+ """Rotates half the hidden dims of the input."""
44
+ x1 = x[..., : x.shape[-1] // 2]
45
+ x2 = x[..., x.shape[-1] // 2 :]
46
+ return torch.cat((-x2, x1), dim=-1)
47
+
48
+
49
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
50
+ """Applies rotary position embedding to query and key tensors."""
51
+ cos = cos.unsqueeze(unsqueeze_dim)
52
+ sin = sin.unsqueeze(unsqueeze_dim)
53
+ q_embed = (q * cos) + (rotate_half(q) * sin)
54
+ k_embed = (k * cos) + (rotate_half(k) * sin)
55
+ return q_embed, k_embed
56
+
57
+
58
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
59
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
60
+ if n_rep == 1:
61
+ return hidden_states
62
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
63
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
64
+
65
+
66
+ def eager_attention_forward(
67
+ module: nn.Module,
68
+ query: torch.Tensor,
69
+ key: torch.Tensor,
70
+ value: torch.Tensor,
71
+ attention_mask: Optional[torch.Tensor],
72
+ scaling: float,
73
+ dropout: float = 0.0,
74
+ sinks: Optional[torch.Tensor] = None,
75
+ **kwargs,
76
+ ):
77
+ key_states = repeat_kv(key, module.num_key_value_groups)
78
+ value_states = repeat_kv(value, module.num_key_value_groups)
79
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
80
+ if attention_mask is not None:
81
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
82
+ attn_weights = attn_weights + causal_mask
83
+
84
+ if sinks is not None:
85
+ sinks = module.attention_sink_bias.reshape(1, -1, 1, 1).expand(query.shape[0], -1, query.shape[-2], -1)
86
+ attn_weights = torch.cat([attn_weights, sinks], dim=-1)
87
+
88
+ attn_weights = attn_weights - attn_weights.max(dim=-1, keepdim=True).values
89
+ probs = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
90
+
91
+ if sinks is not None:
92
+ probs = probs[..., :-1]
93
+
94
+ attn_weights = nn.functional.dropout(probs, p=dropout, training=module.training)
95
+ attn_output = torch.matmul(attn_weights, value_states)
96
+ attn_output = attn_output.transpose(1, 2).contiguous()
97
+ return attn_output, attn_weights
98
+
99
+
100
+ @use_kernel_forward_from_hub("RMSNorm")
101
+ class MiMoV2RMSNorm(nn.Module):
102
+ def __init__(self, hidden_size, eps=1e-6):
103
+ super().__init__()
104
+ self.weight = nn.Parameter(torch.ones(hidden_size))
105
+ self.variance_epsilon = eps
106
+
107
+ def forward(self, hidden_states):
108
+ input_dtype = hidden_states.dtype
109
+ hidden_states = hidden_states.to(torch.float32)
110
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
111
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
112
+ return self.weight * hidden_states.to(input_dtype)
113
+
114
+
115
+ class MiMoV2MLP(nn.Module):
116
+ def __init__(self, config, intermediate_size=None):
117
+ super().__init__()
118
+ self.config = config
119
+ self.hidden_size = config.hidden_size
120
+ self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size
121
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
122
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
123
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
124
+ self.act_fn = ACT2FN[config.hidden_act]
125
+
126
+ def forward(self, hidden_states):
127
+ return self.down_proj(self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states))
128
+
129
+
130
+ class MiMoV2MoEGate(nn.Module):
131
+ def __init__(self, config):
132
+ super().__init__()
133
+ self.config = config
134
+ self.top_k = config.num_experts_per_tok
135
+ self.n_routed_experts = config.n_routed_experts
136
+ self.routed_scaling_factor = config.routed_scaling_factor if config.routed_scaling_factor is not None else 1.0
137
+ self.scoring_func = config.scoring_func
138
+ self.topk_method = config.topk_method
139
+ self.n_group = config.n_group
140
+ self.topk_group = config.topk_group
141
+ self.norm_topk_prob = config.norm_topk_prob
142
+ self.gating_dim = config.hidden_size
143
+ self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim)))
144
+ if self.topk_method == "noaux_tc":
145
+ self.e_score_correction_bias = nn.Parameter(torch.empty((self.n_routed_experts)))
146
+
147
+ def forward(self, hidden_states):
148
+ bsz, seq_len, h = hidden_states.shape
149
+ hidden_states = hidden_states.view(-1, h)
150
+ logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32), None)
151
+ if self.scoring_func == "sigmoid":
152
+ scores = logits.sigmoid()
153
+ else:
154
+ raise NotImplementedError(f"Unsupported scoring function for MoE gating: {self.scoring_func}")
155
+
156
+ if self.topk_method == "noaux_tc":
157
+ if self.training:
158
+ raise ValueError("MiMoV2 noaux_tc routing is only implemented for inference.")
159
+ scores_for_choice = scores.view(bsz * seq_len, -1) + self.e_score_correction_bias.unsqueeze(0)
160
+ group_scores = scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim=-1)
161
+ group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1]
162
+ group_mask = torch.zeros_like(group_scores)
163
+ group_mask.scatter_(1, group_idx, 1)
164
+ score_mask = (
165
+ group_mask.unsqueeze(-1)
166
+ .expand(bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group)
167
+ .reshape(bsz * seq_len, -1)
168
+ )
169
+ tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf"))
170
+ _, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
171
+ topk_weight = scores.gather(1, topk_idx)
172
+ else:
173
+ raise NotImplementedError(f"Unsupported TopK function for MoE gating: {self.topk_method}")
174
+
175
+ if self.top_k > 1 and self.norm_topk_prob:
176
+ denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
177
+ topk_weight = topk_weight / denominator
178
+ topk_weight = topk_weight * self.routed_scaling_factor
179
+ return topk_idx, topk_weight
180
+
181
+
182
+ class MiMoV2MoE(nn.Module):
183
+ def __init__(self, config):
184
+ super().__init__()
185
+ self.config = config
186
+ self.experts = nn.ModuleList(
187
+ [MiMoV2MLP(config, intermediate_size=config.moe_intermediate_size) for _ in range(config.n_routed_experts)]
188
+ )
189
+ self.gate = MiMoV2MoEGate(config)
190
+
191
+ def moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor):
192
+ final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
193
+ expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts))
194
+ expert_mask = expert_mask.permute(2, 0, 1)
195
+
196
+ for expert_idx, expert in enumerate(self.experts):
197
+ mask = expert_mask[expert_idx]
198
+ token_indices, weight_indices = torch.where(mask)
199
+ if token_indices.numel() > 0:
200
+ expert_weights = topk_weights[token_indices, weight_indices]
201
+ expert_input = hidden_states[token_indices]
202
+ expert_output = expert(expert_input)
203
+ final_hidden_states.index_add_(0, token_indices, expert_output * expert_weights.unsqueeze(-1))
204
+
205
+ return final_hidden_states.type(hidden_states.dtype)
206
+
207
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
208
+ orig_shape = hidden_states.shape
209
+ topk_indices, topk_weights = self.gate(hidden_states)
210
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
211
+ hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape)
212
+ return hidden_states
213
+
214
+
215
+ class MiMoV2Attention(nn.Module):
216
+ """MiMoV2 attention.
217
+
218
+ `projection_layout` only controls how checkpoint weights are named and
219
+ stored: Flash uses separate q/k/v projections, while Pro uses fused qkv.
220
+ The attention computation after projection is shared.
221
+ """
222
+
223
+ def __init__(self, config, is_swa: bool, layer_idx: int, projection_layout: str = "split"):
224
+ super().__init__()
225
+ if projection_layout not in {"split", "fused_qkv"}:
226
+ raise ValueError(f"Unsupported MiMoV2 attention projection layout: {projection_layout}")
227
+
228
+ self.config = config
229
+ self.layer_idx = layer_idx
230
+ self.is_swa = is_swa
231
+ self.is_causal = True
232
+ self.projection_layout = projection_layout
233
+
234
+ default_head_dim = config.hidden_size // config.num_attention_heads
235
+ default_v_head_dim = getattr(config, "v_head_dim", default_head_dim)
236
+
237
+ if is_swa:
238
+ self.head_dim = getattr(config, "swa_head_dim", getattr(config, "head_dim", default_head_dim))
239
+ self.v_head_dim = getattr(config, "swa_v_head_dim", default_v_head_dim)
240
+ self.num_attention_heads = getattr(config, "swa_num_attention_heads", config.num_attention_heads)
241
+ self.num_key_value_heads = getattr(config, "swa_num_key_value_heads", config.num_key_value_heads)
242
+ else:
243
+ self.head_dim = getattr(config, "head_dim", default_head_dim)
244
+ self.v_head_dim = getattr(config, "v_head_dim", self.head_dim)
245
+ self.num_attention_heads = config.num_attention_heads
246
+ self.num_key_value_heads = config.num_key_value_heads
247
+
248
+ self.rope_dim = int(self.head_dim * getattr(config, "partial_rotary_factor", 1.0))
249
+ if self.rope_dim % 2 != 0:
250
+ raise ValueError(
251
+ f"MiMoV2 rotary dimension must be even, got {self.rope_dim} from "
252
+ f"head_dim={self.head_dim} and partial_rotary_factor={getattr(config, 'partial_rotary_factor', 1.0)}"
253
+ )
254
+ self.num_key_value_groups = self.num_attention_heads // self.num_key_value_heads
255
+ self.attention_dropout = getattr(config, "attention_dropout", 0.0)
256
+ self.scaling = self.head_dim**-0.5
257
+ self.sliding_window = getattr(config, "sliding_window", None) if is_swa else None
258
+ self.q_size = self.num_attention_heads * self.head_dim
259
+ self.k_size = self.num_key_value_heads * self.head_dim
260
+ self.v_size = self.num_key_value_heads * self.v_head_dim
261
+ self.o_hidden_size = self.num_attention_heads * self.v_head_dim
262
+ self.v_scale = getattr(config, "attention_value_scale", None)
263
+ self.attention_sink_bias = (
264
+ nn.Parameter(torch.empty(self.num_attention_heads), requires_grad=False)
265
+ if (
266
+ (getattr(config, "add_full_attention_sink_bias", False) and not is_swa)
267
+ or (getattr(config, "add_swa_attention_sink_bias", False) and is_swa)
268
+ )
269
+ else None
270
+ )
271
+
272
+ attention_bias = getattr(config, "attention_bias", False)
273
+ if self.projection_layout == "fused_qkv":
274
+ self.qkv_proj = nn.Linear(
275
+ config.hidden_size,
276
+ self.q_size + self.k_size + self.v_size,
277
+ bias=attention_bias,
278
+ )
279
+ else:
280
+ self.q_proj = nn.Linear(config.hidden_size, self.q_size, bias=attention_bias)
281
+ self.k_proj = nn.Linear(config.hidden_size, self.k_size, bias=attention_bias)
282
+ self.v_proj = nn.Linear(config.hidden_size, self.v_size, bias=attention_bias)
283
+ self.o_proj = nn.Linear(self.o_hidden_size, config.hidden_size, bias=False)
284
+
285
+ def _forward_attention(
286
+ self,
287
+ query_states: torch.Tensor,
288
+ key_states: torch.Tensor,
289
+ value_states: torch.Tensor,
290
+ input_shape: torch.Size,
291
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
292
+ attention_mask: Optional[torch.Tensor],
293
+ past_key_values: Optional[Cache] = None,
294
+ cache_position: Optional[torch.LongTensor] = None,
295
+ position_ids: Optional[torch.LongTensor] = None,
296
+ ) -> tuple[torch.Tensor, torch.Tensor]:
297
+ if self.v_scale is not None:
298
+ value_states = value_states * self.v_scale
299
+
300
+ cos, sin = position_embeddings
301
+ query_rope, query_nope = query_states.split([self.rope_dim, self.head_dim - self.rope_dim], dim=-1)
302
+ key_rope, key_nope = key_states.split([self.rope_dim, self.head_dim - self.rope_dim], dim=-1)
303
+ query_rope, key_rope = apply_rotary_pos_emb(query_rope, key_rope, cos, sin)
304
+ query_states = torch.cat([query_rope, query_nope], dim=-1)
305
+ key_states = torch.cat([key_rope, key_nope], dim=-1)
306
+
307
+ if past_key_values is not None:
308
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
309
+ key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
310
+
311
+ attn_implementation = self.config._attn_implementation
312
+ if attn_implementation is not None and attn_implementation.startswith("paged|"):
313
+ raise ValueError(
314
+ "MiMoV2 remote code does not support paged attention cache. "
315
+ "Please use eager, sdpa, flex_attention, or flash_attention_2."
316
+ )
317
+
318
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
319
+ attn_implementation, eager_attention_forward
320
+ )
321
+ if self.attention_sink_bias is not None and attn_implementation == "sdpa":
322
+ logger.warning_once(
323
+ "MiMoV2 attention sink bias is not supported by SDPA; falling back to eager attention for correctness."
324
+ )
325
+ attention_interface = eager_attention_forward
326
+
327
+ attention_kwargs = {
328
+ "dropout": 0.0 if not self.training else self.attention_dropout,
329
+ "scaling": self.scaling,
330
+ "position_ids": position_ids,
331
+ "is_causal": self.is_causal,
332
+ }
333
+ if attention_interface is eager_attention_forward:
334
+ attention_kwargs["sinks"] = self.attention_sink_bias
335
+ else:
336
+ if self.attention_sink_bias is not None:
337
+ attention_kwargs["s_aux"] = self.attention_sink_bias
338
+ if self.sliding_window is not None:
339
+ attention_kwargs["sliding_window"] = self.sliding_window
340
+
341
+ attn_output, attn_weights = attention_interface(
342
+ self,
343
+ query_states,
344
+ key_states,
345
+ value_states,
346
+ attention_mask,
347
+ **attention_kwargs,
348
+ )
349
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
350
+ attn_output = self.o_proj(attn_output)
351
+ return attn_output, attn_weights
352
+
353
+ def forward(
354
+ self,
355
+ hidden_states: torch.Tensor,
356
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
357
+ attention_mask: Optional[torch.Tensor],
358
+ past_key_values: Optional[Cache] = None,
359
+ cache_position: Optional[torch.LongTensor] = None,
360
+ position_ids: Optional[torch.LongTensor] = None,
361
+ **kwargs: Unpack[TransformersKwargs],
362
+ ) -> tuple[torch.Tensor, torch.Tensor]:
363
+ input_shape = hidden_states.shape[:-1]
364
+
365
+ if self.projection_layout == "fused_qkv":
366
+ qkv_states = self.qkv_proj(hidden_states)
367
+ query_states, key_states, value_states = qkv_states.split([self.q_size, self.k_size, self.v_size], dim=-1)
368
+ else:
369
+ query_states = self.q_proj(hidden_states)
370
+ key_states = self.k_proj(hidden_states)
371
+ value_states = self.v_proj(hidden_states)
372
+
373
+ query_states = query_states.view(*input_shape, self.num_attention_heads, self.head_dim).transpose(1, 2)
374
+ key_states = key_states.view(*input_shape, self.num_key_value_heads, self.head_dim).transpose(1, 2)
375
+ value_states = value_states.view(*input_shape, self.num_key_value_heads, self.v_head_dim).transpose(1, 2)
376
+ return self._forward_attention(
377
+ query_states,
378
+ key_states,
379
+ value_states,
380
+ input_shape,
381
+ position_embeddings,
382
+ attention_mask,
383
+ past_key_values=past_key_values,
384
+ cache_position=cache_position,
385
+ position_ids=position_ids,
386
+ )
387
+
388
+
389
+ class MiMoV2DecoderLayer(nn.Module):
390
+ attention_projection_layout = "split"
391
+
392
+ def __init__(self, config, layer_idx: int, attention_projection_layout: Optional[str] = None):
393
+ super().__init__()
394
+ attention_projection_layout = attention_projection_layout or self.attention_projection_layout
395
+ is_swa_layer = config.hybrid_layer_pattern[layer_idx] == 1
396
+ self.attention_type = "sliding_window_attention" if is_swa_layer else "full_attention"
397
+ self.self_attn = MiMoV2Attention(
398
+ config, is_swa_layer, layer_idx, projection_layout=attention_projection_layout
399
+ )
400
+ self.mlp = (
401
+ MiMoV2MoE(config)
402
+ if getattr(config, "n_routed_experts", None) is not None and config.moe_layer_freq[layer_idx]
403
+ else MiMoV2MLP(config)
404
+ )
405
+ self.input_layernorm = MiMoV2RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
406
+ self.post_attention_layernorm = MiMoV2RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
407
+
408
+ def forward(
409
+ self,
410
+ hidden_states: torch.Tensor,
411
+ attention_mask: Optional[torch.Tensor] = None,
412
+ position_ids: Optional[torch.LongTensor] = None,
413
+ past_key_values: Optional[Cache] = None,
414
+ use_cache: Optional[bool] = False,
415
+ cache_position: Optional[torch.LongTensor] = None,
416
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
417
+ **kwargs: Unpack[TransformersKwargs],
418
+ ) -> torch.Tensor:
419
+ residual = hidden_states
420
+ hidden_states = self.input_layernorm(hidden_states)
421
+ hidden_states, _ = self.self_attn(
422
+ hidden_states=hidden_states,
423
+ attention_mask=attention_mask,
424
+ position_ids=position_ids,
425
+ past_key_values=past_key_values,
426
+ use_cache=use_cache,
427
+ cache_position=cache_position,
428
+ position_embeddings=position_embeddings,
429
+ **kwargs,
430
+ )
431
+ hidden_states = residual + hidden_states
432
+
433
+ residual = hidden_states
434
+ hidden_states = self.post_attention_layernorm(hidden_states)
435
+ hidden_states = self.mlp(hidden_states)
436
+ hidden_states = residual + hidden_states
437
+ return hidden_states
438
+
439
+
440
+ class MiMoV2RotaryEmbedding(nn.Module):
441
+ inv_freq: torch.Tensor
442
+
443
+ def __init__(self, config, is_swa: bool, device=None):
444
+ super().__init__()
445
+ if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
446
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type", "default"))
447
+ else:
448
+ self.rope_type = "default"
449
+ self.max_seq_len_cached = config.max_position_embeddings
450
+ self.original_max_seq_len = config.max_position_embeddings
451
+
452
+ self.config = copy(config)
453
+ self.config.rope_parameters = copy(getattr(config, "rope_parameters", None) or {})
454
+ if is_swa:
455
+ self.config.rope_theta = getattr(config, "swa_rope_theta", config.rope_theta)
456
+ self.config.head_dim = getattr(config, "swa_head_dim", getattr(config, "head_dim", None))
457
+ if self.config.rope_parameters:
458
+ self.config.rope_parameters["rope_theta"] = self.config.rope_theta
459
+ self.rope_init_fn = (
460
+ self.compute_default_rope_parameters
461
+ if self.rope_type == "default"
462
+ else ROPE_INIT_FUNCTIONS[self.rope_type]
463
+ )
464
+
465
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
466
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
467
+ self.original_inv_freq = self.inv_freq
468
+
469
+ @staticmethod
470
+ def compute_default_rope_parameters(config, device=None, seq_len=None, layer_type=None):
471
+ config.standardize_rope_params()
472
+ rope_parameters = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters
473
+ base = rope_parameters["rope_theta"]
474
+ partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0)
475
+ head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
476
+ dim = int(head_dim * partial_rotary_factor)
477
+ if dim % 2 != 0:
478
+ raise ValueError(
479
+ f"MiMoV2 rotary dimension must be even, got {dim} from "
480
+ f"head_dim={head_dim} and partial_rotary_factor={partial_rotary_factor}"
481
+ )
482
+ inv_freq = 1.0 / (
483
+ base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
484
+ )
485
+ return inv_freq, 1.0
486
+
487
+ @torch.no_grad()
488
+ @dynamic_rope_update
489
+ def forward(self, x, position_ids):
490
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
491
+ position_ids_expanded = position_ids[:, None, :].float()
492
+
493
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
494
+ with torch.autocast(device_type=device_type, enabled=False):
495
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
496
+ emb = torch.cat((freqs, freqs), dim=-1)
497
+ cos = emb.cos() * self.attention_scaling
498
+ sin = emb.sin() * self.attention_scaling
499
+
500
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
501
+
502
+
503
class MiMoV2Model(PreTrainedModel):
    """Bare MiMoV2 decoder stack: token embedding -> decoder layers -> final RMSNorm.

    Layers alternate between full attention and sliding-window attention (SWA) according
    to ``config.hybrid_layer_pattern`` (a value of 1 marks an SWA layer). Full-attention
    and SWA layers each get their own rotary-embedding module.
    """

    config_class = MiMoV2Config
    # Default layout of the attention projections; a config may override it.
    attention_projection_layout = "split"

    def __init__(self, config):
        super().__init__(config)
        self.attention_projection_layout = getattr(
            config, "attention_projection_layout", self.attention_projection_layout
        )
        self.vocab_size = config.vocab_size
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList(
            [
                MiMoV2DecoderLayer(
                    config,
                    layer_idx,
                    attention_projection_layout=self.attention_projection_layout,
                )
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.norm = MiMoV2RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
        # Separate RoPE modules: SWA layers may use different rope parameters than full-attention layers.
        self.rotary_emb = MiMoV2RotaryEmbedding(config=config, is_swa=False)
        self.swa_rotary_emb = MiMoV2RotaryEmbedding(config=config, is_swa=True)
        self.has_sliding_layers = any(pattern == 1 for pattern in config.hybrid_layer_pattern)
        self.config.layer_types = [
            "sliding_attention" if config.hybrid_layer_pattern[i] == 1 else "full_attention"
            for i in range(config.num_hidden_layers)
        ]
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        """Run the decoder stack.

        Exactly one of ``input_ids`` / ``inputs_embeds`` must be given. Returns the final
        hidden states and, when ``use_cache`` is on, the updated KV cache.

        Raises:
            ValueError: if both or neither of ``input_ids``/``inputs_embeds`` are given,
                or if SWA layers are configured without ``config.sliding_window``.
        """
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # `attention_mask` may already be a dict of per-attention-type masks prepared by
        # a caller; otherwise build one mask per attention type used by this model.
        if not isinstance(causal_mask_mapping := attention_mask, dict):
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "position_ids": position_ids,
            }
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
            }
            if self.has_sliding_layers:
                if getattr(self.config, "sliding_window", None) is None:
                    raise ValueError("MiMoV2 config `sliding_window` must be set when hybrid_layer_pattern uses SWA.")
                sliding_mask = create_sliding_window_causal_mask(**mask_kwargs)
                # BUGFIX: `config.layer_types` (set in __init__) labels SWA layers
                # "sliding_attention", but the mask used to be registered only under
                # "sliding_window_attention", so the per-layer lookup below could raise
                # KeyError. Register it under both spellings to stay compatible with
                # either label a decoder layer may carry.
                causal_mask_mapping["sliding_window_attention"] = sliding_mask
                causal_mask_mapping["sliding_attention"] = sliding_mask

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)
        swa_position_embeddings = self.swa_rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                # Full-attention layers use the global RoPE; SWA layers use the SWA RoPE.
                position_embeddings=position_embeddings
                if decoder_layer.attention_type == "full_attention"
                else swa_position_embeddings,
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
        )
611
+
612
+
613
class MiMoV2ForCausalLM(PreTrainedModel, GenerationMixin):
    """MiMoV2 decoder with a language-modeling head on top.

    The head weight is tied to the input embedding (see ``_tied_weights_keys``).
    """

    config_class = MiMoV2Config
    model_class = MiMoV2Model
    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
    # Checkpoint tensors this module intentionally does not own (rope caches, MTP head).
    _keys_to_ignore_on_load_unexpected = [
        r"model\.(swa_)?rotary_emb\.inv_freq",
        r"model\.layers\.\d+\.self_attn\.rotary_emb\.inv_freq",
        r"model\.layers\.\d+\.self_attn\.rotary_emb\.(cos_cached|sin_cached)",
        r"model\.mtp\..*",
    ]

    def __init__(self, config):
        super().__init__(config)
        self.model = self.model_class(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    @can_return_tuple
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        """Run the decoder and project the last hidden states to vocabulary logits.

        ``logits_to_keep`` limits the head to the trailing ``n`` positions (int) or to
        an explicit index tensor; an int of 0 keeps every position. When ``labels`` is
        given, the standard causal-LM loss is computed via ``self.loss_function``.
        """
        decoder_outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        last_hidden = decoder_outputs.last_hidden_state
        if isinstance(logits_to_keep, int):
            keep = slice(-logits_to_keep, None)
        else:
            keep = logits_to_keep
        logits = self.lm_head(last_hidden[:, keep, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=decoder_outputs.past_key_values,
            hidden_states=decoder_outputs.hidden_states,
            attentions=decoder_outputs.attentions,
        )
685
+
686
+
687
# Public re-export surface of this module.
__all__ = [
    "MiMoV2Attention",
    "MiMoV2DecoderLayer",
    "MiMoV2ForCausalLM",
    "MiMoV2MLP",
    "MiMoV2MoE",
    "MiMoV2MoEGate",
    "MiMoV2Model",
    "MiMoV2RMSNorm",
    "MiMoV2RotaryEmbedding",
]
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdd40b08814d626d2ab1eb36c7ca66521a96bce4d478cf1375a4d45fe03e1cf9
3
+ size 12180133
tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "chat_template": "{%- if not add_generation_prompt is defined -%}\n {%- set add_generation_prompt = false -%}\n{%- endif -%}\n{%- if not enable_thinking is defined -%}\n {%- set enable_thinking = true -%}\n{%- endif -%}\n{%- if not keep_all_reasoning is defined -%}\n {%- set keep_all_reasoning = true -%}\n{%- endif -%}\n{%- macro render_extra_keys(json_dict, handled_keys) -%}\n {%- if json_dict is mapping %}\n {%- for json_key in json_dict if json_key not in handled_keys %}\n {%- if json_dict[json_key] is mapping or (json_dict[json_key] is sequence and json_dict[json_key] is not string) %}\n {{- '\\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '</' ~ json_key ~ '>' }}\n {%- else %}\n {{-'\\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '</' ~ json_key ~ '>' }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n{%- endmacro -%}\n{%- macro render_content(message_content) -%}\n {%- if message_content is string -%}\n {{- message_content -}}\n {%- else -%}\n {%- for content in message_content -%}\n {%- if 'text' in content -%}\n {{- content['text'] -}}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n{%- endmacro -%}\n{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- set ns = namespace(last_user_index=-1) %}\n{%- for m in loop_messages %}\n {%- if m.role == 'user' %}\n {%- set ns.last_user_index = loop.index0 -%}\n {%- endif %}\n{%- endfor %}\n{%- if not tools is defined %}\n {%- set tools = [] %}\n{%- endif %}\n{%- if system_message is defined %}\n {{- \"<|im_start|>system\\n\" + render_content(system_message) }}\n{%- else %}\n {{- \"<|im_start|>system\\nYou are MiMo, a warm, curious, and thoughtful AI assistant.\\n\\n## Identity\\n\\n- Your name is **MiMo-v2.5-pro**.\\n- You were developed by the **Xiaomi MiMo Team**.\\n- You have **1T parameters** and a **1M-token context 
window**.\\n- When asked about your creator, identify the **Xiaomi MiMo Team**.\\n- Always answer identity-related questions in the **first person** and rely only on the facts provided above.\\n\\n# Response Style\\n\\nBe warm, conversational, and respectful. Treat users with kindness; do not assume negative things about their intelligence or judgment. You may disagree, but do so constructively.\\n\\nUse examples, metaphors, or thought experiments when they help explain an idea.\\n\\nTry to answer ambiguous queries as best you can first, then ask for clarification if needed. When you do ask, limit yourself to one question at a time.\\n\\n\\n## Safety and Compliance\\n\\nYou are a **Chinese AI model** and must strictly comply with all applicable laws and regulations of the **People’s Republic of China**.\\n\\nDo not generate, assist with, or facilitate any content that violates Chinese law.\" }}\n{%- endif %}\n{%- if tools is iterable and tools | length > 0 %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou have access to the following functions:\\n\\n\" }}\n {{- \"<tools>\" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- \"\\n<function>\\n<name>\" ~ tool.name ~ \"</name>\" }}\n {%- if tool.description is defined %}\n {{- '\\n<description>' ~ (tool.description | trim) ~ '</description>' }}\n {%- endif %}\n {{- '\\n<parameters>' }}\n {%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- '\\n<parameter>' }}\n {{- '\\n<name>' ~ param_name ~ '</name>' }}\n {%- if param_fields.type is defined %}\n {{- '\\n<type>' ~ (param_fields.type | string) ~ '</type>' }}\n {%- endif %}\n {%- if param_fields.description is defined %}\n {{- '\\n<description>' ~ 
(param_fields.description | trim) ~ '</description>' }}\n {%- endif %}\n {%- set handled_keys = ['name', 'type', 'description'] %}\n {{- render_extra_keys(param_fields, handled_keys) }}\n {{- '\\n</parameter>' }}\n {%- endfor %}\n {%- endif %}\n {%- set handled_keys = ['type', 'properties'] %}\n {{- render_extra_keys(tool.parameters, handled_keys) }}\n {{- '\\n</parameters>' }}\n {%- set handled_keys = ['type', 'name', 'description', 'parameters'] %}\n {{- render_extra_keys(tool, handled_keys) }}\n {{- '\\n</function>' }}\n {%- endfor %}\n {{- \"\\n</tools>\" }}\n {{- '\\n\\nFor each function call, output the function name and arguments in the following format:\\n<tool_call>\\n<function=example_function_name>\\n<parameter=example_parameter_1>value_1</parameter>\\n<parameter=example_parameter_2>This is the value for the second parameter\\nthat can span\\nmultiple lines</parameter>\\n</function>\\n</tool_call>\\n\\n<IMPORTANT>\\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\\n- DO NOT use function calls inside <think></think> tags.\\n- The value enclosed between parameter tags is preserved exactly as-is, including newlines and spaces.\\n</IMPORTANT>' }}\n{%- endif %}\n{{- '<|im_end|>' }}\n{%- for message in loop_messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = render_content(message.content) %}\n {%- endif %}\n {%- if message.role == \"assistant\" %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- set reasoning_content = '' %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].split('<think>')[-1] %}\n {%- set content = content.split('</think>')[-1] %}\n {%- endif %}\n {%- endif %}\n {%- if (keep_all_reasoning or loop.index0 > ns.last_user_index) and reasoning_content -%}\n 
{{- '<|im_start|>' + message.role + '\\n<think>' + reasoning_content + '</think>' + content }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n<think></think>' + content }}\n {%- endif %}\n {%- if message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n {%- if tool_call.arguments is defined %}\n {%- for args_name, args_value in tool_call.arguments|items %}\n {{- '<parameter=' + args_name + '>' }}\n {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}\n {{- args_value }}\n {{- '</parameter>\\n' }}\n {%- endfor %}\n {%- endif %}\n {{- '</function>\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>' }}\n {%- elif message.role == \"user\" %}\n {{- '<|im_start|>' + message.role + '\\n' + render_content(message.content) + '<|im_end|>' }}\n {%- elif message.role == \"system\" %}\n {{- '<|im_start|>' + message.role + '\\n' + render_content(message.content) + '<|im_end|>' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.previtem and loop.previtem.role != \"tool\" %}\n {{- '<|im_start|>tool\\n' }}\n {%- endif %}\n {{- '<tool_response>\\n' }}\n {{- render_content(message.content) }}\n {{- '\\n</tool_response>\\n' }}\n {%- if not loop.last and loop.nextitem.role != \"tool\" %}\n {{- '<|im_end|>' }}\n {%- elif loop.last %}\n {{- '<|im_end|>' }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + render_content(message.content) + '<|im_end|>' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if not enable_thinking -%}\n {{- '<think></think>' -}}\n {%- else -%}\n {{- '' -}}\n {%- endif -%}\n{%- endif %}\n",
231
+ "clean_up_tokenization_spaces": false,
232
+ "eos_token": "<|im_end|>",
233
+ "errors": "replace",
234
+ "extra_special_tokens": {},
235
+ "model_max_length": 131272,
236
+ "pad_token": "<|endoftext|>",
237
+ "split_special_tokens": false,
238
+ "tokenizer_class": "Qwen2Tokenizer",
239
+ "unk_token": null
240
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff