Commit ·
b0739a6
0
Parent(s):
Duplicate from yappertar4/YAPPERTAR-Large-V1
Browse files- .gitattributes +40 -0
- README.md +10 -0
- added_tokens.json +28 -0
- assets/architecture.png +3 -0
- assets/benchmark.jpg +3 -0
- assets/post_training_evaluation.jpg +3 -0
- config.json +156 -0
- configuration_mimo_v2.py +209 -0
- merges.txt +0 -0
- model.safetensors.index.json +3 -0
- model_mtp.safetensors +3 -0
- model_pp0_ep0_shard0.safetensors +3 -0
- model_pp0_ep0_shard1.safetensors +3 -0
- model_pp0_ep10_shard0.safetensors +3 -0
- model_pp0_ep11_shard0.safetensors +3 -0
- model_pp0_ep12_shard0.safetensors +3 -0
- model_pp0_ep13_shard0.safetensors +3 -0
- model_pp0_ep14_shard0.safetensors +3 -0
- model_pp0_ep15_shard0.safetensors +3 -0
- model_pp0_ep16_shard0.safetensors +3 -0
- model_pp0_ep17_shard0.safetensors +3 -0
- model_pp0_ep18_shard0.safetensors +3 -0
- model_pp0_ep19_shard0.safetensors +3 -0
- model_pp0_ep1_shard0.safetensors +3 -0
- model_pp0_ep20_shard0.safetensors +3 -0
- model_pp0_ep21_shard0.safetensors +3 -0
- model_pp0_ep22_shard0.safetensors +3 -0
- model_pp0_ep23_shard0.safetensors +3 -0
- model_pp0_ep24_shard0.safetensors +3 -0
- model_pp0_ep25_shard0.safetensors +3 -0
- model_pp0_ep26_shard0.safetensors +3 -0
- model_pp0_ep27_shard0.safetensors +3 -0
- model_pp0_ep28_shard0.safetensors +3 -0
- model_pp0_ep29_shard0.safetensors +3 -0
- model_pp0_ep2_shard0.safetensors +3 -0
- model_pp0_ep30_shard0.safetensors +3 -0
- model_pp0_ep31_shard0.safetensors +3 -0
- model_pp0_ep3_shard0.safetensors +3 -0
- model_pp0_ep4_shard0.safetensors +3 -0
- model_pp0_ep5_shard0.safetensors +3 -0
- model_pp0_ep6_shard0.safetensors +3 -0
- model_pp0_ep7_shard0.safetensors +3 -0
- model_pp0_ep8_shard0.safetensors +3 -0
- model_pp0_ep9_shard0.safetensors +3 -0
- modeling_mimo_v2.py +697 -0
- special_tokens_map.json +31 -0
- tokenizer.json +3 -0
- tokenizer_config.json +240 -0
- vocab.json +0 -0
.gitattributes
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
assets/architecture.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
assets/benchmark.jpg filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
assets/post_training_evaluation.jpg filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
model.safetensors.index.json filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
language:
|
| 4 |
+
- en
|
| 5 |
+
- zh
|
| 6 |
+
tags:
|
| 7 |
+
- text-generation
|
| 8 |
+
- agent
|
| 9 |
+
- code
|
| 10 |
+
---
|
added_tokens.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</think>": 151668,
|
| 3 |
+
"</tool_call>": 151658,
|
| 4 |
+
"</tool_response>": 151666,
|
| 5 |
+
"<think>": 151667,
|
| 6 |
+
"<tool_call>": 151657,
|
| 7 |
+
"<tool_response>": 151665,
|
| 8 |
+
"<|box_end|>": 151649,
|
| 9 |
+
"<|box_start|>": 151648,
|
| 10 |
+
"<|endoftext|>": 151643,
|
| 11 |
+
"<|file_sep|>": 151664,
|
| 12 |
+
"<|fim_middle|>": 151660,
|
| 13 |
+
"<|fim_pad|>": 151662,
|
| 14 |
+
"<|fim_prefix|>": 151659,
|
| 15 |
+
"<|fim_suffix|>": 151661,
|
| 16 |
+
"<|im_end|>": 151645,
|
| 17 |
+
"<|im_start|>": 151644,
|
| 18 |
+
"<|image_pad|>": 151655,
|
| 19 |
+
"<|object_ref_end|>": 151647,
|
| 20 |
+
"<|object_ref_start|>": 151646,
|
| 21 |
+
"<|quad_end|>": 151651,
|
| 22 |
+
"<|quad_start|>": 151650,
|
| 23 |
+
"<|repo_name|>": 151663,
|
| 24 |
+
"<|video_pad|>": 151656,
|
| 25 |
+
"<|vision_end|>": 151653,
|
| 26 |
+
"<|vision_pad|>": 151654,
|
| 27 |
+
"<|vision_start|>": 151652
|
| 28 |
+
}
|
assets/architecture.png
ADDED
|
Git LFS Details
|
assets/benchmark.jpg
ADDED
|
Git LFS Details
|
assets/post_training_evaluation.jpg
ADDED
|
Git LFS Details
|
config.json
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"MiMoV2ForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"auto_map": {
|
| 6 |
+
"AutoConfig": "configuration_mimo_v2.MiMoV2Config",
|
| 7 |
+
"AutoModel": "modeling_mimo_v2.MiMoV2Model",
|
| 8 |
+
"AutoModelForCausalLM": "modeling_mimo_v2.MiMoV2ForCausalLM"
|
| 9 |
+
},
|
| 10 |
+
"quantization_config": {
|
| 11 |
+
"activation_scheme": "dynamic",
|
| 12 |
+
"fmt": "e4m3",
|
| 13 |
+
"quant_method": "fp8",
|
| 14 |
+
"weight_block_size": [
|
| 15 |
+
128,
|
| 16 |
+
128
|
| 17 |
+
],
|
| 18 |
+
"ignored_layers": [
|
| 19 |
+
"model.layers.0.self_attn.o_proj",
|
| 20 |
+
"model.layers.1.self_attn.o_proj",
|
| 21 |
+
"model.layers.2.self_attn.o_proj",
|
| 22 |
+
"model.layers.3.self_attn.o_proj",
|
| 23 |
+
"model.layers.4.self_attn.o_proj",
|
| 24 |
+
"model.layers.5.self_attn.o_proj",
|
| 25 |
+
"model.layers.6.self_attn.o_proj",
|
| 26 |
+
"model.layers.7.self_attn.o_proj",
|
| 27 |
+
"model.layers.8.self_attn.o_proj",
|
| 28 |
+
"model.layers.9.self_attn.o_proj",
|
| 29 |
+
"model.layers.10.self_attn.o_proj",
|
| 30 |
+
"model.layers.11.self_attn.o_proj",
|
| 31 |
+
"model.layers.12.self_attn.o_proj",
|
| 32 |
+
"model.layers.13.self_attn.o_proj",
|
| 33 |
+
"model.layers.14.self_attn.o_proj",
|
| 34 |
+
"model.layers.15.self_attn.o_proj",
|
| 35 |
+
"model.layers.16.self_attn.o_proj",
|
| 36 |
+
"model.layers.17.self_attn.o_proj",
|
| 37 |
+
"model.layers.18.self_attn.o_proj",
|
| 38 |
+
"model.layers.19.self_attn.o_proj",
|
| 39 |
+
"model.layers.20.self_attn.o_proj",
|
| 40 |
+
"model.layers.21.self_attn.o_proj",
|
| 41 |
+
"model.layers.22.self_attn.o_proj",
|
| 42 |
+
"model.layers.23.self_attn.o_proj",
|
| 43 |
+
"model.layers.24.self_attn.o_proj",
|
| 44 |
+
"model.layers.25.self_attn.o_proj",
|
| 45 |
+
"model.layers.26.self_attn.o_proj",
|
| 46 |
+
"model.layers.27.self_attn.o_proj",
|
| 47 |
+
"model.layers.28.self_attn.o_proj",
|
| 48 |
+
"model.layers.29.self_attn.o_proj",
|
| 49 |
+
"model.layers.30.self_attn.o_proj",
|
| 50 |
+
"model.layers.31.self_attn.o_proj",
|
| 51 |
+
"model.layers.32.self_attn.o_proj",
|
| 52 |
+
"model.layers.33.self_attn.o_proj",
|
| 53 |
+
"model.layers.34.self_attn.o_proj",
|
| 54 |
+
"model.layers.35.self_attn.o_proj",
|
| 55 |
+
"model.layers.36.self_attn.o_proj",
|
| 56 |
+
"model.layers.37.self_attn.o_proj",
|
| 57 |
+
"model.layers.38.self_attn.o_proj",
|
| 58 |
+
"model.layers.39.self_attn.o_proj",
|
| 59 |
+
"model.layers.40.self_attn.o_proj",
|
| 60 |
+
"model.layers.41.self_attn.o_proj",
|
| 61 |
+
"model.layers.42.self_attn.o_proj",
|
| 62 |
+
"model.layers.43.self_attn.o_proj",
|
| 63 |
+
"model.layers.44.self_attn.o_proj",
|
| 64 |
+
"model.layers.45.self_attn.o_proj",
|
| 65 |
+
"model.layers.46.self_attn.o_proj",
|
| 66 |
+
"model.layers.47.self_attn.o_proj",
|
| 67 |
+
"model.layers.48.self_attn.o_proj",
|
| 68 |
+
"model.layers.49.self_attn.o_proj",
|
| 69 |
+
"model.layers.50.self_attn.o_proj",
|
| 70 |
+
"model.layers.51.self_attn.o_proj",
|
| 71 |
+
"model.layers.52.self_attn.o_proj",
|
| 72 |
+
"model.layers.53.self_attn.o_proj",
|
| 73 |
+
"model.layers.54.self_attn.o_proj",
|
| 74 |
+
"model.layers.55.self_attn.o_proj",
|
| 75 |
+
"model.layers.56.self_attn.o_proj",
|
| 76 |
+
"model.layers.57.self_attn.o_proj",
|
| 77 |
+
"model.layers.58.self_attn.o_proj",
|
| 78 |
+
"model.layers.59.self_attn.o_proj",
|
| 79 |
+
"model.layers.60.self_attn.o_proj",
|
| 80 |
+
"model.layers.61.self_attn.o_proj",
|
| 81 |
+
"model.layers.62.self_attn.o_proj",
|
| 82 |
+
"model.layers.63.self_attn.o_proj",
|
| 83 |
+
"model.layers.64.self_attn.o_proj",
|
| 84 |
+
"model.layers.65.self_attn.o_proj",
|
| 85 |
+
"model.layers.66.self_attn.o_proj",
|
| 86 |
+
"model.layers.67.self_attn.o_proj",
|
| 87 |
+
"model.layers.68.self_attn.o_proj",
|
| 88 |
+
"model.layers.69.self_attn.o_proj",
|
| 89 |
+
"model.decoder.self_attn.o_proj"
|
| 90 |
+
]
|
| 91 |
+
},
|
| 92 |
+
"add_full_attention_sink_bias": false,
|
| 93 |
+
"add_swa_attention_sink_bias": true,
|
| 94 |
+
"attention_bias": false,
|
| 95 |
+
"attention_chunk_size": 128,
|
| 96 |
+
"attention_dropout": 0.0,
|
| 97 |
+
"attention_projection_layout": "fused_qkv",
|
| 98 |
+
"attention_value_scale": 0.612,
|
| 99 |
+
"head_dim": 192,
|
| 100 |
+
"hidden_act": "silu",
|
| 101 |
+
"hidden_size": 6144,
|
| 102 |
+
"hybrid_layer_pattern": [
|
| 103 |
+
0, 1, 1, 1, 1, 1, 1,
|
| 104 |
+
0, 1, 1, 1, 1, 1, 1, 1,
|
| 105 |
+
0, 1, 1, 1, 1, 1, 1, 1,
|
| 106 |
+
0, 1, 1, 1, 1, 1, 1, 1,
|
| 107 |
+
0, 1, 1, 1, 1, 1, 1, 1,
|
| 108 |
+
0, 1, 1, 1, 1, 1, 1, 1,
|
| 109 |
+
0, 1, 1, 1, 1, 1, 1, 1,
|
| 110 |
+
0, 1, 1, 1, 1, 1, 1,
|
| 111 |
+
0, 1, 1, 1, 1, 1, 1,
|
| 112 |
+
0
|
| 113 |
+
],
|
| 114 |
+
"initializer_range": 0.02,
|
| 115 |
+
"intermediate_size": 16384,
|
| 116 |
+
"layernorm_epsilon": 1e-05,
|
| 117 |
+
"max_position_embeddings": 1048576,
|
| 118 |
+
"model_type": "mimo_v2",
|
| 119 |
+
"moe_intermediate_size": 2048,
|
| 120 |
+
"moe_layer_freq": [
|
| 121 |
+
0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
| 122 |
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
| 123 |
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
| 124 |
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
| 125 |
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
| 126 |
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
| 127 |
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1
|
| 128 |
+
],
|
| 129 |
+
"n_group": 1,
|
| 130 |
+
"n_routed_experts": 384,
|
| 131 |
+
"n_shared_experts": null,
|
| 132 |
+
"norm_topk_prob": true,
|
| 133 |
+
"num_attention_heads": 128,
|
| 134 |
+
"num_experts_per_tok": 8,
|
| 135 |
+
"num_hidden_layers": 70,
|
| 136 |
+
"num_key_value_heads": 8,
|
| 137 |
+
"partial_rotary_factor": 0.334,
|
| 138 |
+
"rope_theta": 10000000,
|
| 139 |
+
"routed_scaling_factor": null,
|
| 140 |
+
"scoring_func": "sigmoid",
|
| 141 |
+
"sliding_window": 128,
|
| 142 |
+
"sliding_window_size": 128,
|
| 143 |
+
"swa_head_dim": 192,
|
| 144 |
+
"swa_num_attention_heads": 128,
|
| 145 |
+
"swa_num_key_value_heads": 8,
|
| 146 |
+
"swa_rope_theta": 10000,
|
| 147 |
+
"swa_v_head_dim": 128,
|
| 148 |
+
"tie_word_embeddings": false,
|
| 149 |
+
"topk_group": 1,
|
| 150 |
+
"topk_method": "noaux_tc",
|
| 151 |
+
"torch_dtype": "bfloat16",
|
| 152 |
+
"transformers_version": "4.57.1",
|
| 153 |
+
"use_cache": true,
|
| 154 |
+
"v_head_dim": 128,
|
| 155 |
+
"vocab_size": 152576
|
| 156 |
+
}
|
configuration_mimo_v2.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# coding=utf-8
|
| 2 |
+
#
|
| 3 |
+
# Copyright 2026 Xiaomi Corporation.
|
| 4 |
+
# Copyright 2026 The HuggingFace Inc. team.
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 7 |
+
# you may not use this file except in compliance with the License.
|
| 8 |
+
# You may obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 15 |
+
# See the License for the specific language governing permissions and
|
| 16 |
+
# limitations under the License.
|
| 17 |
+
|
| 18 |
+
from transformers.configuration_utils import PretrainedConfig
|
| 19 |
+
from transformers.modeling_rope_utils import rope_config_validation
|
| 20 |
+
from transformers.utils import logging
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
logger = logging.get_logger(__name__)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
_MIMOV2_ATTENTION_PROJECTION_LAYOUTS = {"split", "fused_qkv"}
|
| 27 |
+
|
| 28 |
+
_MIMOV2_SPLIT_TP_PLAN = {
|
| 29 |
+
"layers.*.self_attn.q_proj": "colwise",
|
| 30 |
+
"layers.*.self_attn.k_proj": "colwise",
|
| 31 |
+
"layers.*.self_attn.v_proj": "colwise",
|
| 32 |
+
"layers.*.self_attn.o_proj": "rowwise",
|
| 33 |
+
"layers.*.mlp.gate_proj": "colwise",
|
| 34 |
+
"layers.*.mlp.up_proj": "colwise",
|
| 35 |
+
"layers.*.mlp.down_proj": "rowwise",
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
_MIMOV2_FUSED_QKV_TP_PLAN = {
|
| 39 |
+
"layers.*.self_attn.qkv_proj": "colwise",
|
| 40 |
+
"layers.*.self_attn.o_proj": "rowwise",
|
| 41 |
+
"layers.*.mlp.gate_proj": "colwise",
|
| 42 |
+
"layers.*.mlp.up_proj": "colwise",
|
| 43 |
+
"layers.*.mlp.down_proj": "rowwise",
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
_MIMOV2_PP_PLAN = {
|
| 47 |
+
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
|
| 48 |
+
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
|
| 49 |
+
"norm": (["hidden_states"], ["hidden_states"]),
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class MiMoV2Config(PretrainedConfig):
|
| 54 |
+
|
| 55 |
+
model_type = "mimo_v2"
|
| 56 |
+
keys_to_ignore_at_inference = ["past_key_values"]
|
| 57 |
+
|
| 58 |
+
base_model_tp_plan = _MIMOV2_SPLIT_TP_PLAN
|
| 59 |
+
base_model_pp_plan = _MIMOV2_PP_PLAN
|
| 60 |
+
|
| 61 |
+
attribute_map = {
|
| 62 |
+
"num_local_experts": "n_routed_experts",
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
def __init__(
|
| 66 |
+
self,
|
| 67 |
+
vocab_size=151936,
|
| 68 |
+
hidden_size=4096,
|
| 69 |
+
intermediate_size=22016,
|
| 70 |
+
num_hidden_layers=32,
|
| 71 |
+
num_attention_heads=32,
|
| 72 |
+
num_key_value_heads=32,
|
| 73 |
+
hidden_act="silu",
|
| 74 |
+
max_position_embeddings=32768,
|
| 75 |
+
initializer_range=0.02,
|
| 76 |
+
layernorm_epsilon=1e-6,
|
| 77 |
+
use_cache=True,
|
| 78 |
+
tie_word_embeddings=False,
|
| 79 |
+
rope_theta=10000.0,
|
| 80 |
+
rope_scaling=None,
|
| 81 |
+
attention_dropout=0.0,
|
| 82 |
+
attention_bias=False,
|
| 83 |
+
attention_value_scale=None,
|
| 84 |
+
head_dim=None,
|
| 85 |
+
v_head_dim=None,
|
| 86 |
+
swa_num_attention_heads=None,
|
| 87 |
+
swa_num_key_value_heads=None,
|
| 88 |
+
swa_head_dim=None,
|
| 89 |
+
swa_v_head_dim=None,
|
| 90 |
+
swa_rope_theta=None,
|
| 91 |
+
sliding_window=None,
|
| 92 |
+
sliding_window_size=None,
|
| 93 |
+
add_full_attention_sink_bias=False,
|
| 94 |
+
add_swa_attention_sink_bias=False,
|
| 95 |
+
hybrid_block_size=None,
|
| 96 |
+
hybrid_layer_pattern=None,
|
| 97 |
+
partial_rotary_factor=1.0,
|
| 98 |
+
n_routed_experts=None,
|
| 99 |
+
moe_intermediate_size=None,
|
| 100 |
+
num_experts_per_tok=None,
|
| 101 |
+
routed_scaling_factor=None,
|
| 102 |
+
scoring_func="sigmoid",
|
| 103 |
+
topk_method="noaux_tc",
|
| 104 |
+
n_group=None,
|
| 105 |
+
topk_group=None,
|
| 106 |
+
norm_topk_prob=True,
|
| 107 |
+
moe_layer_freq=None,
|
| 108 |
+
attention_projection_layout="split",
|
| 109 |
+
**kwargs,
|
| 110 |
+
):
|
| 111 |
+
rope_parameters = kwargs.pop("rope_parameters", None)
|
| 112 |
+
if rope_scaling is None and rope_parameters is not None:
|
| 113 |
+
rope_scaling = rope_parameters
|
| 114 |
+
|
| 115 |
+
if attention_projection_layout is None:
|
| 116 |
+
attention_projection_layout = "split"
|
| 117 |
+
if attention_projection_layout not in _MIMOV2_ATTENTION_PROJECTION_LAYOUTS:
|
| 118 |
+
raise ValueError(f"Unsupported MiMoV2 attention projection layout: {attention_projection_layout}")
|
| 119 |
+
|
| 120 |
+
self.attention_projection_layout = attention_projection_layout
|
| 121 |
+
self.base_model_tp_plan = (
|
| 122 |
+
_MIMOV2_FUSED_QKV_TP_PLAN.copy()
|
| 123 |
+
if attention_projection_layout == "fused_qkv"
|
| 124 |
+
else _MIMOV2_SPLIT_TP_PLAN.copy()
|
| 125 |
+
)
|
| 126 |
+
self.base_model_pp_plan = _MIMOV2_PP_PLAN.copy()
|
| 127 |
+
|
| 128 |
+
self.vocab_size = vocab_size
|
| 129 |
+
self.max_position_embeddings = max_position_embeddings
|
| 130 |
+
self.hidden_size = hidden_size
|
| 131 |
+
self.intermediate_size = intermediate_size
|
| 132 |
+
self.num_hidden_layers = num_hidden_layers
|
| 133 |
+
self.num_attention_heads = num_attention_heads
|
| 134 |
+
|
| 135 |
+
if num_key_value_heads is None:
|
| 136 |
+
num_key_value_heads = num_attention_heads
|
| 137 |
+
if num_attention_heads % num_key_value_heads != 0:
|
| 138 |
+
raise ValueError("num_attention_heads must be divisible by num_key_value_heads")
|
| 139 |
+
|
| 140 |
+
self.num_key_value_heads = num_key_value_heads
|
| 141 |
+
self.hidden_act = hidden_act
|
| 142 |
+
self.initializer_range = initializer_range
|
| 143 |
+
self.layernorm_epsilon = layernorm_epsilon
|
| 144 |
+
self.use_cache = use_cache
|
| 145 |
+
self.rope_theta = rope_theta
|
| 146 |
+
self.rope_scaling = rope_scaling
|
| 147 |
+
self.attention_dropout = attention_dropout
|
| 148 |
+
self.attention_bias = attention_bias
|
| 149 |
+
self.attention_value_scale = attention_value_scale
|
| 150 |
+
|
| 151 |
+
self.head_dim = head_dim if head_dim is not None else hidden_size // num_attention_heads
|
| 152 |
+
self.v_head_dim = v_head_dim if v_head_dim is not None else self.head_dim
|
| 153 |
+
self.swa_num_attention_heads = (
|
| 154 |
+
swa_num_attention_heads if swa_num_attention_heads is not None else num_attention_heads
|
| 155 |
+
)
|
| 156 |
+
self.swa_num_key_value_heads = (
|
| 157 |
+
swa_num_key_value_heads if swa_num_key_value_heads is not None else num_key_value_heads
|
| 158 |
+
)
|
| 159 |
+
if self.swa_num_attention_heads % self.swa_num_key_value_heads != 0:
|
| 160 |
+
raise ValueError("swa_num_attention_heads must be divisible by swa_num_key_value_heads")
|
| 161 |
+
self.swa_head_dim = swa_head_dim if swa_head_dim is not None else self.head_dim
|
| 162 |
+
self.swa_v_head_dim = swa_v_head_dim if swa_v_head_dim is not None else self.swa_head_dim
|
| 163 |
+
self.swa_rope_theta = swa_rope_theta if swa_rope_theta is not None else rope_theta
|
| 164 |
+
|
| 165 |
+
if sliding_window is None:
|
| 166 |
+
sliding_window = sliding_window_size
|
| 167 |
+
self.sliding_window = sliding_window
|
| 168 |
+
self.sliding_window_size = sliding_window_size if sliding_window_size is not None else sliding_window
|
| 169 |
+
self.add_full_attention_sink_bias = add_full_attention_sink_bias
|
| 170 |
+
self.add_swa_attention_sink_bias = add_swa_attention_sink_bias
|
| 171 |
+
|
| 172 |
+
if hybrid_block_size is not None and hybrid_layer_pattern is None:
|
| 173 |
+
hybrid_layer_pattern = [0 if ((i + 1) % hybrid_block_size == 0) else 1 for i in range(num_hidden_layers)]
|
| 174 |
+
elif hybrid_layer_pattern is None:
|
| 175 |
+
hybrid_layer_pattern = [0] * num_hidden_layers
|
| 176 |
+
if len(hybrid_layer_pattern) != num_hidden_layers:
|
| 177 |
+
raise ValueError("hybrid_layer_pattern length must match num_hidden_layers")
|
| 178 |
+
self.hybrid_block_size = hybrid_block_size
|
| 179 |
+
self.hybrid_layer_pattern = hybrid_layer_pattern
|
| 180 |
+
|
| 181 |
+
self.partial_rotary_factor = partial_rotary_factor
|
| 182 |
+
|
| 183 |
+
self.n_routed_experts = n_routed_experts
|
| 184 |
+
self.moe_intermediate_size = moe_intermediate_size if moe_intermediate_size is not None else intermediate_size
|
| 185 |
+
self.num_experts_per_tok = num_experts_per_tok
|
| 186 |
+
self.routed_scaling_factor = routed_scaling_factor
|
| 187 |
+
self.scoring_func = scoring_func
|
| 188 |
+
self.topk_method = topk_method
|
| 189 |
+
self.n_group = n_group
|
| 190 |
+
self.topk_group = topk_group
|
| 191 |
+
self.norm_topk_prob = norm_topk_prob
|
| 192 |
+
if isinstance(moe_layer_freq, int):
|
| 193 |
+
moe_layer_freq = [moe_layer_freq > 0 and i % moe_layer_freq == 0 for i in range(num_hidden_layers)]
|
| 194 |
+
elif moe_layer_freq is None:
|
| 195 |
+
moe_layer_freq = [False] * num_hidden_layers
|
| 196 |
+
if len(moe_layer_freq) != num_hidden_layers:
|
| 197 |
+
raise ValueError("moe_layer_freq length must match num_hidden_layers")
|
| 198 |
+
self.moe_layer_freq = moe_layer_freq
|
| 199 |
+
|
| 200 |
+
if self.rope_scaling is not None and "type" in self.rope_scaling:
|
| 201 |
+
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
|
| 202 |
+
rope_config_validation(self)
|
| 203 |
+
|
| 204 |
+
super().__init__(
|
| 205 |
+
tie_word_embeddings=tie_word_embeddings,
|
| 206 |
+
**kwargs,
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
+
__all__ = ["MiMoV2Config"]
|
merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
model.safetensors.index.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7ad2187407f8cf989f140a42d9bf340be7a220b2f6b35afa0a728774eb20f4b
|
| 3 |
+
size 15569001
|
model_mtp.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e0f1599b41996feab60b2b2ea42338b26979a2b58b115526bb49371a91e3c8d8
|
| 3 |
+
size 2463641280
|
model_pp0_ep0_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e3aac22ecd430136c92626bbf1345dc9e99d9800a4cd49878793b52a6154cd40
|
| 3 |
+
size 34554911640
|
model_pp0_ep0_shard1.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6422d714cb680c72ebc5982b95252f5457e7f4625ceb29fad5e05aecc10c3b57
|
| 3 |
+
size 27180576088
|
model_pp0_ep10_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bf18b588ea10bead2879179f15bbca4e91e9abd59863581e2cd9004470d2d54d
|
| 3 |
+
size 31264218696
|
model_pp0_ep11_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1a0b5d0ef3523bd3ce8fe2eab1bb226ce5122bf72fb30cf7bbe6e8f99459bb90
|
| 3 |
+
size 31264218696
|
model_pp0_ep12_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b8f63229280884fc677945fee04391246c3d4d7aacf90f327ef430ab62bc72c6
|
| 3 |
+
size 31264218696
|
model_pp0_ep13_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:93704b69e4f0e0fa148e2118f605518fddb62dbda17d4575918aa5f075dfb890
|
| 3 |
+
size 31264218696
|
model_pp0_ep14_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4ebf87de52e82194428200f82e28c666bf42639c43d32b12218041ec67e4dccb
|
| 3 |
+
size 31264218696
|
model_pp0_ep15_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ffbe58cf0bf67a61481ea7d5bb6f6dd310c546c5bed21ff45db264fbb5ffca46
|
| 3 |
+
size 31264218696
|
model_pp0_ep16_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8d5028da852f2ae339e047deb0f4dbdcb5770ffbc411bc130947b40b6cc7c0f4
|
| 3 |
+
size 31264218696
|
model_pp0_ep17_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e3ac44fff1a24c635d28ce4addc84589cb6638d6284f2e17a502b4b23dc0d9fa
|
| 3 |
+
size 31264218696
|
model_pp0_ep18_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:701682b0e073f471a47d3a61b1adb13d7e1183eacd053be2aafd6e33c2f1efb9
|
| 3 |
+
size 31264218696
|
model_pp0_ep19_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:653d5ed28d8320369708a145b209d0fdb6cb03673351a1cc92eff163dfe6757e
|
| 3 |
+
size 31264218696
|
model_pp0_ep1_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a6ae084dd12811777d1da10784235d5014a21a6b22fe936e70d0ab4682c336d7
|
| 3 |
+
size 31264213728
|
model_pp0_ep20_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2ae1f90c36626cb3944ed4986f7df3181b761be5658e9cd561691dab946b1f28
|
| 3 |
+
size 31264218696
|
model_pp0_ep21_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:25954cf458b4e149523d7f436ef404d51c6cc7d4fa6d2a12bfaa9e4fb7fb147c
|
| 3 |
+
size 31264218696
|
model_pp0_ep22_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bb181bc5d65f77090b8423503c31895ed122d041bb6cf6d4a7b7846744d6be05
|
| 3 |
+
size 31264218696
|
model_pp0_ep23_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f67f2343e3b92066428bea0fb39eccc68d3f587ff1ad274478e3d369120fb61b
|
| 3 |
+
size 31264218696
|
model_pp0_ep24_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:588c6199211b4b947b53ce0fa93df4597a2da0090a4ca637e69e70428ef1780c
|
| 3 |
+
size 31264218696
|
model_pp0_ep25_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c845e96be226275597955cfe995ee41fa6b7eafd3a2e579595efadaca961c8f0
|
| 3 |
+
size 31264218696
|
model_pp0_ep26_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:30f987f808058ba27447d8200e5c0d41df64a2d9525159f8ab9c5f635d69509a
|
| 3 |
+
size 31264218696
|
model_pp0_ep27_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b151aeb375b84ba967792207e8bd4e0c7ede979534f8648d3f5fa2bb88659ac
|
| 3 |
+
size 31264218696
|
model_pp0_ep28_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2538741e85b3430549f389be672d8cda5384f88f0f92af9d83442c37936ddd14
|
| 3 |
+
size 31264218696
|
model_pp0_ep29_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bf56b7502b8aabd8d0779978e9ad03f4905715c2654cd9c1218571ea76f28a25
|
| 3 |
+
size 31264218696
|
model_pp0_ep2_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4df6ddf8e6c3ced5d78f11a47c0f425b898cdb92acb31675bbb93de6aeeb52f6
|
| 3 |
+
size 31264213728
|
model_pp0_ep30_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4e3aa954fa6729a12a773800fdfbd5a71dae18e30e4a4b7177d430173d541152
|
| 3 |
+
size 31264218696
|
model_pp0_ep31_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bc4350c6f373209df4f49755ae06ede1a1c1ebed2f1fc98778e71c2b180de480
|
| 3 |
+
size 31264218696
|
model_pp0_ep3_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3696b210440cfe93b533e0945263203d917e01a9b8e8921845b5e262a69875de
|
| 3 |
+
size 31264213728
|
model_pp0_ep4_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b042d23818e9d537e48b63cfa437ddc2be152f2a588fc5bd678acf1c2f67bc51
|
| 3 |
+
size 31264213728
|
model_pp0_ep5_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dd6e5884dae5be62fdcd1bf843281c7f7dbd671f461e57d0764cf1803afd5aa4
|
| 3 |
+
size 31264213728
|
model_pp0_ep6_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bbde11cae32b38026481129d053b976842958da86f40abf91fbcd91387b9ae67
|
| 3 |
+
size 31264213728
|
model_pp0_ep7_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5ff4bc2f37ab4496349874eff47ed0b253aab854983c597b040d6912bfa730eb
|
| 3 |
+
size 31264213728
|
model_pp0_ep8_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7bf1dc12b959388fb6355718d9217457f8564a35145c52200e778952542479e1
|
| 3 |
+
size 31264217040
|
model_pp0_ep9_shard0.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f1d7ce5137511e43470781806571b3575246c55e69dd22d05acb85a603133b1a
|
| 3 |
+
size 31264218696
|
modeling_mimo_v2.py
ADDED
|
@@ -0,0 +1,697 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# coding=utf-8
|
| 2 |
+
#
|
| 3 |
+
# Copyright 2026 Xiaomi Corporation.
|
| 4 |
+
# Copyright 2026 The HuggingFace Inc. team.
|
| 5 |
+
#
|
| 6 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 7 |
+
# you may not use this file except in compliance with the License.
|
| 8 |
+
# You may obtain a copy of the License at
|
| 9 |
+
#
|
| 10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 11 |
+
#
|
| 12 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 15 |
+
# See the License for the specific language governing permissions and
|
| 16 |
+
# limitations under the License.
|
| 17 |
+
|
| 18 |
+
from copy import copy
|
| 19 |
+
from typing import Callable, Optional, Union
|
| 20 |
+
|
| 21 |
+
import torch
|
| 22 |
+
import torch.nn as nn
|
| 23 |
+
import torch.nn.functional as F
|
| 24 |
+
|
| 25 |
+
from transformers.activations import ACT2FN
|
| 26 |
+
from transformers.cache_utils import Cache, DynamicCache
|
| 27 |
+
from transformers.generation import GenerationMixin
|
| 28 |
+
from transformers.integrations import use_kernel_forward_from_hub
|
| 29 |
+
from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
|
| 30 |
+
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
|
| 31 |
+
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
|
| 32 |
+
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
| 33 |
+
from transformers.processing_utils import Unpack
|
| 34 |
+
from transformers.utils import TransformersKwargs, can_return_tuple, logging
|
| 35 |
+
|
| 36 |
+
from .configuration_mimo_v2 import MiMoV2Config
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
logger = logging.get_logger(__name__)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def rotate_half(x):
|
| 43 |
+
"""Rotates half the hidden dims of the input."""
|
| 44 |
+
x1 = x[..., : x.shape[-1] // 2]
|
| 45 |
+
x2 = x[..., x.shape[-1] // 2 :]
|
| 46 |
+
return torch.cat((-x2, x1), dim=-1)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
|
| 50 |
+
"""Applies rotary position embedding to query and key tensors."""
|
| 51 |
+
cos = cos.unsqueeze(unsqueeze_dim)
|
| 52 |
+
sin = sin.unsqueeze(unsqueeze_dim)
|
| 53 |
+
q_embed = (q * cos) + (rotate_half(q) * sin)
|
| 54 |
+
k_embed = (k * cos) + (rotate_half(k) * sin)
|
| 55 |
+
return q_embed, k_embed
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
| 59 |
+
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
|
| 60 |
+
if n_rep == 1:
|
| 61 |
+
return hidden_states
|
| 62 |
+
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
|
| 63 |
+
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def eager_attention_forward(
|
| 67 |
+
module: nn.Module,
|
| 68 |
+
query: torch.Tensor,
|
| 69 |
+
key: torch.Tensor,
|
| 70 |
+
value: torch.Tensor,
|
| 71 |
+
attention_mask: Optional[torch.Tensor],
|
| 72 |
+
scaling: float,
|
| 73 |
+
dropout: float = 0.0,
|
| 74 |
+
sinks: Optional[torch.Tensor] = None,
|
| 75 |
+
**kwargs,
|
| 76 |
+
):
|
| 77 |
+
key_states = repeat_kv(key, module.num_key_value_groups)
|
| 78 |
+
value_states = repeat_kv(value, module.num_key_value_groups)
|
| 79 |
+
attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
|
| 80 |
+
if attention_mask is not None:
|
| 81 |
+
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
|
| 82 |
+
attn_weights = attn_weights + causal_mask
|
| 83 |
+
|
| 84 |
+
if sinks is not None:
|
| 85 |
+
sinks = module.attention_sink_bias.reshape(1, -1, 1, 1).expand(query.shape[0], -1, query.shape[-2], -1)
|
| 86 |
+
attn_weights = torch.cat([attn_weights, sinks], dim=-1)
|
| 87 |
+
|
| 88 |
+
attn_weights = attn_weights - attn_weights.max(dim=-1, keepdim=True).values
|
| 89 |
+
probs = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
|
| 90 |
+
|
| 91 |
+
if sinks is not None:
|
| 92 |
+
probs = probs[..., :-1]
|
| 93 |
+
|
| 94 |
+
attn_weights = nn.functional.dropout(probs, p=dropout, training=module.training)
|
| 95 |
+
attn_output = torch.matmul(attn_weights, value_states)
|
| 96 |
+
attn_output = attn_output.transpose(1, 2).contiguous()
|
| 97 |
+
return attn_output, attn_weights
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
@use_kernel_forward_from_hub("RMSNorm")
|
| 101 |
+
class MiMoV2RMSNorm(nn.Module):
|
| 102 |
+
def __init__(self, hidden_size, eps=1e-6):
|
| 103 |
+
super().__init__()
|
| 104 |
+
self.weight = nn.Parameter(torch.ones(hidden_size))
|
| 105 |
+
self.variance_epsilon = eps
|
| 106 |
+
|
| 107 |
+
def forward(self, hidden_states):
|
| 108 |
+
input_dtype = hidden_states.dtype
|
| 109 |
+
hidden_states = hidden_states.to(torch.float32)
|
| 110 |
+
variance = hidden_states.pow(2).mean(-1, keepdim=True)
|
| 111 |
+
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
|
| 112 |
+
return self.weight * hidden_states.to(input_dtype)
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
class MiMoV2MLP(nn.Module):
|
| 116 |
+
def __init__(self, config, intermediate_size=None):
|
| 117 |
+
super().__init__()
|
| 118 |
+
self.config = config
|
| 119 |
+
self.hidden_size = config.hidden_size
|
| 120 |
+
self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size
|
| 121 |
+
self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
|
| 122 |
+
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
|
| 123 |
+
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
|
| 124 |
+
self.act_fn = ACT2FN[config.hidden_act]
|
| 125 |
+
|
| 126 |
+
def forward(self, hidden_states):
|
| 127 |
+
return self.down_proj(self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states))
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
class MiMoV2MoEGate(nn.Module):
|
| 131 |
+
def __init__(self, config):
|
| 132 |
+
super().__init__()
|
| 133 |
+
self.config = config
|
| 134 |
+
self.top_k = config.num_experts_per_tok
|
| 135 |
+
self.n_routed_experts = config.n_routed_experts
|
| 136 |
+
self.routed_scaling_factor = config.routed_scaling_factor if config.routed_scaling_factor is not None else 1.0
|
| 137 |
+
self.scoring_func = config.scoring_func
|
| 138 |
+
self.topk_method = config.topk_method
|
| 139 |
+
self.n_group = config.n_group
|
| 140 |
+
self.topk_group = config.topk_group
|
| 141 |
+
self.norm_topk_prob = config.norm_topk_prob
|
| 142 |
+
self.gating_dim = config.hidden_size
|
| 143 |
+
self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim)))
|
| 144 |
+
if self.topk_method == "noaux_tc":
|
| 145 |
+
self.e_score_correction_bias = nn.Parameter(torch.empty((self.n_routed_experts)))
|
| 146 |
+
|
| 147 |
+
def forward(self, hidden_states):
|
| 148 |
+
bsz, seq_len, h = hidden_states.shape
|
| 149 |
+
hidden_states = hidden_states.view(-1, h)
|
| 150 |
+
logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32), None)
|
| 151 |
+
if self.scoring_func == "sigmoid":
|
| 152 |
+
scores = logits.sigmoid()
|
| 153 |
+
else:
|
| 154 |
+
raise NotImplementedError(f"Unsupported scoring function for MoE gating: {self.scoring_func}")
|
| 155 |
+
|
| 156 |
+
if self.topk_method == "noaux_tc":
|
| 157 |
+
if self.training:
|
| 158 |
+
raise ValueError("MiMoV2 noaux_tc routing is only implemented for inference.")
|
| 159 |
+
scores_for_choice = scores.view(bsz * seq_len, -1) + self.e_score_correction_bias.unsqueeze(0)
|
| 160 |
+
group_scores = scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim=-1)
|
| 161 |
+
group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1]
|
| 162 |
+
group_mask = torch.zeros_like(group_scores)
|
| 163 |
+
group_mask.scatter_(1, group_idx, 1)
|
| 164 |
+
score_mask = (
|
| 165 |
+
group_mask.unsqueeze(-1)
|
| 166 |
+
.expand(bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group)
|
| 167 |
+
.reshape(bsz * seq_len, -1)
|
| 168 |
+
)
|
| 169 |
+
tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf"))
|
| 170 |
+
_, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
|
| 171 |
+
topk_weight = scores.gather(1, topk_idx)
|
| 172 |
+
else:
|
| 173 |
+
raise NotImplementedError(f"Unsupported TopK function for MoE gating: {self.topk_method}")
|
| 174 |
+
|
| 175 |
+
if self.top_k > 1 and self.norm_topk_prob:
|
| 176 |
+
denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
|
| 177 |
+
topk_weight = topk_weight / denominator
|
| 178 |
+
topk_weight = topk_weight * self.routed_scaling_factor
|
| 179 |
+
return topk_idx, topk_weight
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
class MiMoV2MoE(nn.Module):
|
| 183 |
+
def __init__(self, config):
|
| 184 |
+
super().__init__()
|
| 185 |
+
self.config = config
|
| 186 |
+
self.experts = nn.ModuleList(
|
| 187 |
+
[MiMoV2MLP(config, intermediate_size=config.moe_intermediate_size) for _ in range(config.n_routed_experts)]
|
| 188 |
+
)
|
| 189 |
+
self.gate = MiMoV2MoEGate(config)
|
| 190 |
+
|
| 191 |
+
def moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor):
|
| 192 |
+
final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
|
| 193 |
+
expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts))
|
| 194 |
+
expert_mask = expert_mask.permute(2, 0, 1)
|
| 195 |
+
|
| 196 |
+
for expert_idx, expert in enumerate(self.experts):
|
| 197 |
+
mask = expert_mask[expert_idx]
|
| 198 |
+
token_indices, weight_indices = torch.where(mask)
|
| 199 |
+
if token_indices.numel() > 0:
|
| 200 |
+
expert_weights = topk_weights[token_indices, weight_indices]
|
| 201 |
+
expert_input = hidden_states[token_indices]
|
| 202 |
+
expert_output = expert(expert_input)
|
| 203 |
+
final_hidden_states.index_add_(0, token_indices, expert_output * expert_weights.unsqueeze(-1))
|
| 204 |
+
|
| 205 |
+
return final_hidden_states.type(hidden_states.dtype)
|
| 206 |
+
|
| 207 |
+
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
| 208 |
+
orig_shape = hidden_states.shape
|
| 209 |
+
topk_indices, topk_weights = self.gate(hidden_states)
|
| 210 |
+
hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
|
| 211 |
+
hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape)
|
| 212 |
+
return hidden_states
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
class MiMoV2Attention(nn.Module):
|
| 216 |
+
"""MiMoV2 attention.
|
| 217 |
+
|
| 218 |
+
`projection_layout` only controls how checkpoint weights are named and
|
| 219 |
+
stored: Flash uses separate q/k/v projections, while Pro uses fused qkv.
|
| 220 |
+
The attention computation after projection is shared.
|
| 221 |
+
"""
|
| 222 |
+
|
| 223 |
+
def __init__(self, config, is_swa: bool, layer_idx: int, projection_layout: str = "split"):
|
| 224 |
+
super().__init__()
|
| 225 |
+
if projection_layout not in {"split", "fused_qkv"}:
|
| 226 |
+
raise ValueError(f"Unsupported MiMoV2 attention projection layout: {projection_layout}")
|
| 227 |
+
|
| 228 |
+
self.config = config
|
| 229 |
+
self.layer_idx = layer_idx
|
| 230 |
+
self.is_swa = is_swa
|
| 231 |
+
self.is_causal = True
|
| 232 |
+
self.projection_layout = projection_layout
|
| 233 |
+
|
| 234 |
+
default_head_dim = config.hidden_size // config.num_attention_heads
|
| 235 |
+
default_v_head_dim = getattr(config, "v_head_dim", default_head_dim)
|
| 236 |
+
|
| 237 |
+
if is_swa:
|
| 238 |
+
self.head_dim = getattr(config, "swa_head_dim", getattr(config, "head_dim", default_head_dim))
|
| 239 |
+
self.v_head_dim = getattr(config, "swa_v_head_dim", default_v_head_dim)
|
| 240 |
+
self.num_attention_heads = getattr(config, "swa_num_attention_heads", config.num_attention_heads)
|
| 241 |
+
self.num_key_value_heads = getattr(config, "swa_num_key_value_heads", config.num_key_value_heads)
|
| 242 |
+
else:
|
| 243 |
+
self.head_dim = getattr(config, "head_dim", default_head_dim)
|
| 244 |
+
self.v_head_dim = getattr(config, "v_head_dim", self.head_dim)
|
| 245 |
+
self.num_attention_heads = config.num_attention_heads
|
| 246 |
+
self.num_key_value_heads = config.num_key_value_heads
|
| 247 |
+
|
| 248 |
+
self.rope_dim = int(self.head_dim * getattr(config, "partial_rotary_factor", 1.0))
|
| 249 |
+
if self.rope_dim % 2 != 0:
|
| 250 |
+
raise ValueError(
|
| 251 |
+
f"MiMoV2 rotary dimension must be even, got {self.rope_dim} from "
|
| 252 |
+
f"head_dim={self.head_dim} and partial_rotary_factor={getattr(config, 'partial_rotary_factor', 1.0)}"
|
| 253 |
+
)
|
| 254 |
+
self.num_key_value_groups = self.num_attention_heads // self.num_key_value_heads
|
| 255 |
+
self.attention_dropout = getattr(config, "attention_dropout", 0.0)
|
| 256 |
+
self.scaling = self.head_dim**-0.5
|
| 257 |
+
self.sliding_window = getattr(config, "sliding_window", None) if is_swa else None
|
| 258 |
+
self.q_size = self.num_attention_heads * self.head_dim
|
| 259 |
+
self.k_size = self.num_key_value_heads * self.head_dim
|
| 260 |
+
self.v_size = self.num_key_value_heads * self.v_head_dim
|
| 261 |
+
self.o_hidden_size = self.num_attention_heads * self.v_head_dim
|
| 262 |
+
self.v_scale = getattr(config, "attention_value_scale", None)
|
| 263 |
+
self.attention_sink_bias = (
|
| 264 |
+
nn.Parameter(torch.empty(self.num_attention_heads), requires_grad=False)
|
| 265 |
+
if (
|
| 266 |
+
(getattr(config, "add_full_attention_sink_bias", False) and not is_swa)
|
| 267 |
+
or (getattr(config, "add_swa_attention_sink_bias", False) and is_swa)
|
| 268 |
+
)
|
| 269 |
+
else None
|
| 270 |
+
)
|
| 271 |
+
|
| 272 |
+
attention_bias = getattr(config, "attention_bias", False)
|
| 273 |
+
if self.projection_layout == "fused_qkv":
|
| 274 |
+
self.qkv_proj = nn.Linear(
|
| 275 |
+
config.hidden_size,
|
| 276 |
+
self.q_size + self.k_size + self.v_size,
|
| 277 |
+
bias=attention_bias,
|
| 278 |
+
)
|
| 279 |
+
else:
|
| 280 |
+
self.q_proj = nn.Linear(config.hidden_size, self.q_size, bias=attention_bias)
|
| 281 |
+
self.k_proj = nn.Linear(config.hidden_size, self.k_size, bias=attention_bias)
|
| 282 |
+
self.v_proj = nn.Linear(config.hidden_size, self.v_size, bias=attention_bias)
|
| 283 |
+
self.o_proj = nn.Linear(self.o_hidden_size, config.hidden_size, bias=False)
|
| 284 |
+
|
| 285 |
+
def _forward_attention(
|
| 286 |
+
self,
|
| 287 |
+
query_states: torch.Tensor,
|
| 288 |
+
key_states: torch.Tensor,
|
| 289 |
+
value_states: torch.Tensor,
|
| 290 |
+
input_shape: torch.Size,
|
| 291 |
+
position_embeddings: tuple[torch.Tensor, torch.Tensor],
|
| 292 |
+
attention_mask: Optional[torch.Tensor],
|
| 293 |
+
past_key_values: Optional[Cache] = None,
|
| 294 |
+
cache_position: Optional[torch.LongTensor] = None,
|
| 295 |
+
position_ids: Optional[torch.LongTensor] = None,
|
| 296 |
+
) -> tuple[torch.Tensor, torch.Tensor]:
|
| 297 |
+
if self.v_scale is not None:
|
| 298 |
+
value_states = value_states * self.v_scale
|
| 299 |
+
|
| 300 |
+
cos, sin = position_embeddings
|
| 301 |
+
query_rope, query_nope = query_states.split([self.rope_dim, self.head_dim - self.rope_dim], dim=-1)
|
| 302 |
+
key_rope, key_nope = key_states.split([self.rope_dim, self.head_dim - self.rope_dim], dim=-1)
|
| 303 |
+
query_rope, key_rope = apply_rotary_pos_emb(query_rope, key_rope, cos, sin)
|
| 304 |
+
query_states = torch.cat([query_rope, query_nope], dim=-1)
|
| 305 |
+
key_states = torch.cat([key_rope, key_nope], dim=-1)
|
| 306 |
+
|
| 307 |
+
if past_key_values is not None:
|
| 308 |
+
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
|
| 309 |
+
key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
| 310 |
+
|
| 311 |
+
attn_implementation = self.config._attn_implementation
|
| 312 |
+
if attn_implementation is not None and attn_implementation.startswith("paged|"):
|
| 313 |
+
raise ValueError(
|
| 314 |
+
"MiMoV2 remote code does not support paged attention cache. "
|
| 315 |
+
"Please use eager, sdpa, flex_attention, or flash_attention_2."
|
| 316 |
+
)
|
| 317 |
+
|
| 318 |
+
attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
|
| 319 |
+
attn_implementation, eager_attention_forward
|
| 320 |
+
)
|
| 321 |
+
if self.attention_sink_bias is not None and attn_implementation == "sdpa":
|
| 322 |
+
logger.warning_once(
|
| 323 |
+
"MiMoV2 attention sink bias is not supported by SDPA; falling back to eager attention for correctness."
|
| 324 |
+
)
|
| 325 |
+
attention_interface = eager_attention_forward
|
| 326 |
+
|
| 327 |
+
attention_kwargs = {
|
| 328 |
+
"dropout": 0.0 if not self.training else self.attention_dropout,
|
| 329 |
+
"scaling": self.scaling,
|
| 330 |
+
"position_ids": position_ids,
|
| 331 |
+
"is_causal": self.is_causal,
|
| 332 |
+
}
|
| 333 |
+
if attention_interface is eager_attention_forward:
|
| 334 |
+
attention_kwargs["sinks"] = self.attention_sink_bias
|
| 335 |
+
else:
|
| 336 |
+
if self.attention_sink_bias is not None:
|
| 337 |
+
attention_kwargs["s_aux"] = self.attention_sink_bias
|
| 338 |
+
if self.sliding_window is not None:
|
| 339 |
+
attention_kwargs["sliding_window"] = self.sliding_window
|
| 340 |
+
|
| 341 |
+
attn_output, attn_weights = attention_interface(
|
| 342 |
+
self,
|
| 343 |
+
query_states,
|
| 344 |
+
key_states,
|
| 345 |
+
value_states,
|
| 346 |
+
attention_mask,
|
| 347 |
+
**attention_kwargs,
|
| 348 |
+
)
|
| 349 |
+
attn_output = attn_output.reshape(*input_shape, -1).contiguous()
|
| 350 |
+
attn_output = self.o_proj(attn_output)
|
| 351 |
+
return attn_output, attn_weights
|
| 352 |
+
|
| 353 |
+
def forward(
|
| 354 |
+
self,
|
| 355 |
+
hidden_states: torch.Tensor,
|
| 356 |
+
position_embeddings: tuple[torch.Tensor, torch.Tensor],
|
| 357 |
+
attention_mask: Optional[torch.Tensor],
|
| 358 |
+
past_key_values: Optional[Cache] = None,
|
| 359 |
+
cache_position: Optional[torch.LongTensor] = None,
|
| 360 |
+
position_ids: Optional[torch.LongTensor] = None,
|
| 361 |
+
**kwargs: Unpack[TransformersKwargs],
|
| 362 |
+
) -> tuple[torch.Tensor, torch.Tensor]:
|
| 363 |
+
input_shape = hidden_states.shape[:-1]
|
| 364 |
+
|
| 365 |
+
if self.projection_layout == "fused_qkv":
|
| 366 |
+
qkv_states = self.qkv_proj(hidden_states)
|
| 367 |
+
query_states, key_states, value_states = qkv_states.split([self.q_size, self.k_size, self.v_size], dim=-1)
|
| 368 |
+
else:
|
| 369 |
+
query_states = self.q_proj(hidden_states)
|
| 370 |
+
key_states = self.k_proj(hidden_states)
|
| 371 |
+
value_states = self.v_proj(hidden_states)
|
| 372 |
+
|
| 373 |
+
query_states = query_states.view(*input_shape, self.num_attention_heads, self.head_dim).transpose(1, 2)
|
| 374 |
+
key_states = key_states.view(*input_shape, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
| 375 |
+
value_states = value_states.view(*input_shape, self.num_key_value_heads, self.v_head_dim).transpose(1, 2)
|
| 376 |
+
return self._forward_attention(
|
| 377 |
+
query_states,
|
| 378 |
+
key_states,
|
| 379 |
+
value_states,
|
| 380 |
+
input_shape,
|
| 381 |
+
position_embeddings,
|
| 382 |
+
attention_mask,
|
| 383 |
+
past_key_values=past_key_values,
|
| 384 |
+
cache_position=cache_position,
|
| 385 |
+
position_ids=position_ids,
|
| 386 |
+
)
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
class MiMoV2DecoderLayer(nn.Module):
|
| 390 |
+
attention_projection_layout = "split"
|
| 391 |
+
|
| 392 |
+
def __init__(self, config, layer_idx: int, attention_projection_layout: Optional[str] = None):
|
| 393 |
+
super().__init__()
|
| 394 |
+
attention_projection_layout = attention_projection_layout or self.attention_projection_layout
|
| 395 |
+
is_swa_layer = config.hybrid_layer_pattern[layer_idx] == 1
|
| 396 |
+
self.attention_type = "sliding_window_attention" if is_swa_layer else "full_attention"
|
| 397 |
+
self.self_attn = MiMoV2Attention(
|
| 398 |
+
config, is_swa_layer, layer_idx, projection_layout=attention_projection_layout
|
| 399 |
+
)
|
| 400 |
+
self.mlp = (
|
| 401 |
+
MiMoV2MoE(config)
|
| 402 |
+
if getattr(config, "n_routed_experts", None) is not None and config.moe_layer_freq[layer_idx]
|
| 403 |
+
else MiMoV2MLP(config)
|
| 404 |
+
)
|
| 405 |
+
self.input_layernorm = MiMoV2RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
|
| 406 |
+
self.post_attention_layernorm = MiMoV2RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
|
| 407 |
+
|
| 408 |
+
def forward(
|
| 409 |
+
self,
|
| 410 |
+
hidden_states: torch.Tensor,
|
| 411 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 412 |
+
position_ids: Optional[torch.LongTensor] = None,
|
| 413 |
+
past_key_values: Optional[Cache] = None,
|
| 414 |
+
use_cache: Optional[bool] = False,
|
| 415 |
+
cache_position: Optional[torch.LongTensor] = None,
|
| 416 |
+
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
|
| 417 |
+
**kwargs: Unpack[TransformersKwargs],
|
| 418 |
+
) -> torch.Tensor:
|
| 419 |
+
residual = hidden_states
|
| 420 |
+
hidden_states = self.input_layernorm(hidden_states)
|
| 421 |
+
hidden_states, _ = self.self_attn(
|
| 422 |
+
hidden_states=hidden_states,
|
| 423 |
+
attention_mask=attention_mask,
|
| 424 |
+
position_ids=position_ids,
|
| 425 |
+
past_key_values=past_key_values,
|
| 426 |
+
use_cache=use_cache,
|
| 427 |
+
cache_position=cache_position,
|
| 428 |
+
position_embeddings=position_embeddings,
|
| 429 |
+
**kwargs,
|
| 430 |
+
)
|
| 431 |
+
hidden_states = residual + hidden_states
|
| 432 |
+
|
| 433 |
+
residual = hidden_states
|
| 434 |
+
hidden_states = self.post_attention_layernorm(hidden_states)
|
| 435 |
+
hidden_states = self.mlp(hidden_states)
|
| 436 |
+
hidden_states = residual + hidden_states
|
| 437 |
+
return hidden_states
|
| 438 |
+
|
| 439 |
+
|
| 440 |
+
class MiMoV2RotaryEmbedding(nn.Module):
|
| 441 |
+
inv_freq: torch.Tensor
|
| 442 |
+
|
| 443 |
+
def __init__(self, config, is_swa: bool, device=None):
|
| 444 |
+
super().__init__()
|
| 445 |
+
if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
|
| 446 |
+
self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type", "default"))
|
| 447 |
+
else:
|
| 448 |
+
self.rope_type = "default"
|
| 449 |
+
self.max_seq_len_cached = config.max_position_embeddings
|
| 450 |
+
self.original_max_seq_len = config.max_position_embeddings
|
| 451 |
+
|
| 452 |
+
self.config = copy(config)
|
| 453 |
+
self.config.rope_parameters = copy(getattr(config, "rope_parameters", None) or {})
|
| 454 |
+
if is_swa:
|
| 455 |
+
self.config.rope_theta = getattr(config, "swa_rope_theta", config.rope_theta)
|
| 456 |
+
self.config.head_dim = getattr(config, "swa_head_dim", getattr(config, "head_dim", None))
|
| 457 |
+
if self.config.rope_parameters:
|
| 458 |
+
self.config.rope_parameters["rope_theta"] = self.config.rope_theta
|
| 459 |
+
self.rope_init_fn = (
|
| 460 |
+
self.compute_default_rope_parameters
|
| 461 |
+
if self.rope_type == "default"
|
| 462 |
+
else ROPE_INIT_FUNCTIONS[self.rope_type]
|
| 463 |
+
)
|
| 464 |
+
|
| 465 |
+
inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
|
| 466 |
+
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
| 467 |
+
self.original_inv_freq = self.inv_freq
|
| 468 |
+
|
| 469 |
+
@staticmethod
|
| 470 |
+
def compute_default_rope_parameters(config, device=None, seq_len=None, layer_type=None):
|
| 471 |
+
config.standardize_rope_params()
|
| 472 |
+
rope_parameters = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters
|
| 473 |
+
base = rope_parameters["rope_theta"]
|
| 474 |
+
partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0)
|
| 475 |
+
head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
|
| 476 |
+
dim = int(head_dim * partial_rotary_factor)
|
| 477 |
+
if dim % 2 != 0:
|
| 478 |
+
raise ValueError(
|
| 479 |
+
f"MiMoV2 rotary dimension must be even, got {dim} from "
|
| 480 |
+
f"head_dim={head_dim} and partial_rotary_factor={partial_rotary_factor}"
|
| 481 |
+
)
|
| 482 |
+
inv_freq = 1.0 / (
|
| 483 |
+
base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
|
| 484 |
+
)
|
| 485 |
+
return inv_freq, 1.0
|
| 486 |
+
|
| 487 |
+
@torch.no_grad()
|
| 488 |
+
@dynamic_rope_update
|
| 489 |
+
def forward(self, x, position_ids):
|
| 490 |
+
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
|
| 491 |
+
position_ids_expanded = position_ids[:, None, :].float()
|
| 492 |
+
|
| 493 |
+
device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
|
| 494 |
+
with torch.autocast(device_type=device_type, enabled=False):
|
| 495 |
+
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
|
| 496 |
+
emb = torch.cat((freqs, freqs), dim=-1)
|
| 497 |
+
cos = emb.cos() * self.attention_scaling
|
| 498 |
+
sin = emb.sin() * self.attention_scaling
|
| 499 |
+
|
| 500 |
+
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
| 501 |
+
|
| 502 |
+
|
| 503 |
+
class MiMoV2Model(PreTrainedModel):
    """Bare MiMo-V2 decoder: token embeddings, a stack of hybrid
    (full / sliding-window) attention decoder layers, and a final RMSNorm.

    Returns hidden states only; the LM head lives in ``MiMoV2ForCausalLM``.
    """

    config_class = MiMoV2Config
    # Default q/k/v projection weight layout; a config may override it in __init__.
    attention_projection_layout = "split"

    def __init__(self, config):
        super().__init__(config)
        # Config-level override of the class default, if present.
        self.attention_projection_layout = getattr(
            config, "attention_projection_layout", self.attention_projection_layout
        )
        self.vocab_size = config.vocab_size
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList(
            [
                MiMoV2DecoderLayer(
                    config,
                    layer_idx,
                    attention_projection_layout=self.attention_projection_layout,
                )
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.norm = MiMoV2RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
        # Two rotary tables: one for full-attention layers, one for SWA layers
        # (constructed with is_swa=False/True respectively).
        self.rotary_emb = MiMoV2RotaryEmbedding(config=config, is_swa=False)
        self.swa_rotary_emb = MiMoV2RotaryEmbedding(config=config, is_swa=True)
        # hybrid_layer_pattern: entry == 1 marks a sliding-window layer,
        # anything else is full attention.
        self.has_sliding_layers = any(pattern == 1 for pattern in config.hybrid_layer_pattern)
        self.config.layer_types = [
            "sliding_attention" if config.hybrid_layer_pattern[i] == 1 else "full_attention"
            for i in range(config.num_hidden_layers)
        ]
        self.post_init()

    def get_input_embeddings(self):
        # Token-embedding table (tied to the LM head in the causal-LM wrapper).
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        """Run the decoder stack.

        Exactly one of ``input_ids`` / ``inputs_embeds`` must be given.
        ``attention_mask`` may already be a dict mapping attention type to a
        prepared mask, in which case it is used as-is.
        """
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        # Lazily create a cache on the first forward call when caching is on.
        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        # cache_position indexes the new tokens within the full (cached) sequence.
        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # Build one causal mask per attention flavour, unless the caller already
        # supplied a prepared {attention_type: mask} dict.
        if not isinstance(causal_mask_mapping := attention_mask, dict):
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "position_ids": position_ids,
            }
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
            }
            if self.has_sliding_layers:
                if getattr(self.config, "sliding_window", None) is None:
                    raise ValueError("MiMoV2 config `sliding_window` must be set when hybrid_layer_pattern uses SWA.")
                # NOTE(review): this mapping key is "sliding_window_attention" while
                # config.layer_types above uses "sliding_attention". The per-layer
                # `attention_type` consumed below must match this key or the lookup
                # raises KeyError — confirm against MiMoV2DecoderLayer.
                causal_mask_mapping["sliding_window_attention"] = create_sliding_window_causal_mask(**mask_kwargs)

        hidden_states = inputs_embeds
        # Rotary position embeddings computed once and shared across all layers;
        # SWA layers receive the sliding-window variant below.
        position_embeddings = self.rotary_emb(hidden_states, position_ids)
        swa_position_embeddings = self.swa_rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                position_embeddings=position_embeddings
                if decoder_layer.attention_type == "full_attention"
                else swa_position_embeddings,
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
        )
|
| 611 |
+
|
| 612 |
+
|
| 613 |
+
class MiMoV2ForCausalLM(PreTrainedModel, GenerationMixin):
    """MiMo-V2 decoder backbone topped with a (weight-tied) language-modeling head."""

    config_class = MiMoV2Config
    model_class = MiMoV2Model
    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
    # Checkpoint tensors (rope caches, MTP head) that have no module in this class.
    _keys_to_ignore_on_load_unexpected = [
        r"model\.(swa_)?rotary_emb\.inv_freq",
        r"model\.layers\.\d+\.self_attn\.rotary_emb\.inv_freq",
        r"model\.layers\.\d+\.self_attn\.rotary_emb\.(cos_cached|sin_cached)",
        r"model\.mtp\..*",
    ]

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.vocab_size
        self.model = self.model_class(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        # Delegates to the backbone's embedding table.
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    @can_return_tuple
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        """Decode with the backbone, project to vocabulary logits, and compute
        the LM loss when ``labels`` is provided.

        ``logits_to_keep``: an int ``k`` keeps only the last ``k`` positions
        (0 keeps all); a tensor is used directly as an index.
        """
        backbone_out: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        last_hidden = backbone_out.last_hidden_state
        # Restrict the (potentially huge) vocab projection to the requested positions.
        if isinstance(logits_to_keep, int):
            keep_index = slice(-logits_to_keep, None)
        else:
            keep_index = logits_to_keep
        logits = self.lm_head(last_hidden[:, keep_index, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=backbone_out.past_key_values,
            hidden_states=backbone_out.hidden_states,
            attentions=backbone_out.attentions,
        )
|
| 685 |
+
|
| 686 |
+
|
| 687 |
+
# Public symbols exported via `from modeling_mimo_v2 import *` (kept alphabetical).
__all__ = [
    "MiMoV2Attention",
    "MiMoV2DecoderLayer",
    "MiMoV2ForCausalLM",
    "MiMoV2MLP",
    "MiMoV2MoE",
    "MiMoV2MoEGate",
    "MiMoV2Model",
    "MiMoV2RMSNorm",
    "MiMoV2RotaryEmbedding",
]
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<|im_start|>",
|
| 4 |
+
"<|im_end|>",
|
| 5 |
+
"<|object_ref_start|>",
|
| 6 |
+
"<|object_ref_end|>",
|
| 7 |
+
"<|box_start|>",
|
| 8 |
+
"<|box_end|>",
|
| 9 |
+
"<|quad_start|>",
|
| 10 |
+
"<|quad_end|>",
|
| 11 |
+
"<|vision_start|>",
|
| 12 |
+
"<|vision_end|>",
|
| 13 |
+
"<|vision_pad|>",
|
| 14 |
+
"<|image_pad|>",
|
| 15 |
+
"<|video_pad|>"
|
| 16 |
+
],
|
| 17 |
+
"eos_token": {
|
| 18 |
+
"content": "<|im_end|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
},
|
| 24 |
+
"pad_token": {
|
| 25 |
+
"content": "<|endoftext|>",
|
| 26 |
+
"lstrip": false,
|
| 27 |
+
"normalized": false,
|
| 28 |
+
"rstrip": false,
|
| 29 |
+
"single_word": false
|
| 30 |
+
}
|
| 31 |
+
}
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cdd40b08814d626d2ab1eb36c7ca66521a96bce4d478cf1375a4d45fe03e1cf9
|
| 3 |
+
size 12180133
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
},
|
| 181 |
+
"151665": {
|
| 182 |
+
"content": "<tool_response>",
|
| 183 |
+
"lstrip": false,
|
| 184 |
+
"normalized": false,
|
| 185 |
+
"rstrip": false,
|
| 186 |
+
"single_word": false,
|
| 187 |
+
"special": false
|
| 188 |
+
},
|
| 189 |
+
"151666": {
|
| 190 |
+
"content": "</tool_response>",
|
| 191 |
+
"lstrip": false,
|
| 192 |
+
"normalized": false,
|
| 193 |
+
"rstrip": false,
|
| 194 |
+
"single_word": false,
|
| 195 |
+
"special": false
|
| 196 |
+
},
|
| 197 |
+
"151667": {
|
| 198 |
+
"content": "<think>",
|
| 199 |
+
"lstrip": false,
|
| 200 |
+
"normalized": false,
|
| 201 |
+
"rstrip": false,
|
| 202 |
+
"single_word": false,
|
| 203 |
+
"special": false
|
| 204 |
+
},
|
| 205 |
+
"151668": {
|
| 206 |
+
"content": "</think>",
|
| 207 |
+
"lstrip": false,
|
| 208 |
+
"normalized": false,
|
| 209 |
+
"rstrip": false,
|
| 210 |
+
"single_word": false,
|
| 211 |
+
"special": false
|
| 212 |
+
}
|
| 213 |
+
},
|
| 214 |
+
"additional_special_tokens": [
|
| 215 |
+
"<|im_start|>",
|
| 216 |
+
"<|im_end|>",
|
| 217 |
+
"<|object_ref_start|>",
|
| 218 |
+
"<|object_ref_end|>",
|
| 219 |
+
"<|box_start|>",
|
| 220 |
+
"<|box_end|>",
|
| 221 |
+
"<|quad_start|>",
|
| 222 |
+
"<|quad_end|>",
|
| 223 |
+
"<|vision_start|>",
|
| 224 |
+
"<|vision_end|>",
|
| 225 |
+
"<|vision_pad|>",
|
| 226 |
+
"<|image_pad|>",
|
| 227 |
+
"<|video_pad|>"
|
| 228 |
+
],
|
| 229 |
+
"bos_token": null,
|
| 230 |
+
"chat_template": "{%- if not add_generation_prompt is defined -%}\n {%- set add_generation_prompt = false -%}\n{%- endif -%}\n{%- if not enable_thinking is defined -%}\n {%- set enable_thinking = true -%}\n{%- endif -%}\n{%- if not keep_all_reasoning is defined -%}\n {%- set keep_all_reasoning = true -%}\n{%- endif -%}\n{%- macro render_extra_keys(json_dict, handled_keys) -%}\n {%- if json_dict is mapping %}\n {%- for json_key in json_dict if json_key not in handled_keys %}\n {%- if json_dict[json_key] is mapping or (json_dict[json_key] is sequence and json_dict[json_key] is not string) %}\n {{- '\\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '</' ~ json_key ~ '>' }}\n {%- else %}\n {{-'\\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '</' ~ json_key ~ '>' }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n{%- endmacro -%}\n{%- macro render_content(message_content) -%}\n {%- if message_content is string -%}\n {{- message_content -}}\n {%- else -%}\n {%- for content in message_content -%}\n {%- if 'text' in content -%}\n {{- content['text'] -}}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n{%- endmacro -%}\n{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- set ns = namespace(last_user_index=-1) %}\n{%- for m in loop_messages %}\n {%- if m.role == 'user' %}\n {%- set ns.last_user_index = loop.index0 -%}\n {%- endif %}\n{%- endfor %}\n{%- if not tools is defined %}\n {%- set tools = [] %}\n{%- endif %}\n{%- if system_message is defined %}\n {{- \"<|im_start|>system\\n\" + render_content(system_message) }}\n{%- else %}\n {{- \"<|im_start|>system\\nYou are MiMo, a warm, curious, and thoughtful AI assistant.\\n\\n## Identity\\n\\n- Your name is **MiMo-v2.5-pro**.\\n- You were developed by the **Xiaomi MiMo Team**.\\n- You have **1T parameters** and a **1M-token context 
window**.\\n- When asked about your creator, identify the **Xiaomi MiMo Team**.\\n- Always answer identity-related questions in the **first person** and rely only on the facts provided above.\\n\\n# Response Style\\n\\nBe warm, conversational, and respectful. Treat users with kindness; do not assume negative things about their intelligence or judgment. You may disagree, but do so constructively.\\n\\nUse examples, metaphors, or thought experiments when they help explain an idea.\\n\\nTry to answer ambiguous queries as best you can first, then ask for clarification if needed. When you do ask, limit yourself to one question at a time.\\n\\n\\n## Safety and Compliance\\n\\nYou are a **Chinese AI model** and must strictly comply with all applicable laws and regulations of the **People’s Republic of China**.\\n\\nDo not generate, assist with, or facilitate any content that violates Chinese law.\" }}\n{%- endif %}\n{%- if tools is iterable and tools | length > 0 %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou have access to the following functions:\\n\\n\" }}\n {{- \"<tools>\" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- \"\\n<function>\\n<name>\" ~ tool.name ~ \"</name>\" }}\n {%- if tool.description is defined %}\n {{- '\\n<description>' ~ (tool.description | trim) ~ '</description>' }}\n {%- endif %}\n {{- '\\n<parameters>' }}\n {%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- '\\n<parameter>' }}\n {{- '\\n<name>' ~ param_name ~ '</name>' }}\n {%- if param_fields.type is defined %}\n {{- '\\n<type>' ~ (param_fields.type | string) ~ '</type>' }}\n {%- endif %}\n {%- if param_fields.description is defined %}\n {{- '\\n<description>' ~ 
(param_fields.description | trim) ~ '</description>' }}\n {%- endif %}\n {%- set handled_keys = ['name', 'type', 'description'] %}\n {{- render_extra_keys(param_fields, handled_keys) }}\n {{- '\\n</parameter>' }}\n {%- endfor %}\n {%- endif %}\n {%- set handled_keys = ['type', 'properties'] %}\n {{- render_extra_keys(tool.parameters, handled_keys) }}\n {{- '\\n</parameters>' }}\n {%- set handled_keys = ['type', 'name', 'description', 'parameters'] %}\n {{- render_extra_keys(tool, handled_keys) }}\n {{- '\\n</function>' }}\n {%- endfor %}\n {{- \"\\n</tools>\" }}\n {{- '\\n\\nFor each function call, output the function name and arguments in the following format:\\n<tool_call>\\n<function=example_function_name>\\n<parameter=example_parameter_1>value_1</parameter>\\n<parameter=example_parameter_2>This is the value for the second parameter\\nthat can span\\nmultiple lines</parameter>\\n</function>\\n</tool_call>\\n\\n<IMPORTANT>\\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\\n- DO NOT use function calls inside <think></think> tags.\\n- The value enclosed between parameter tags is preserved exactly as-is, including newlines and spaces.\\n</IMPORTANT>' }}\n{%- endif %}\n{{- '<|im_end|>' }}\n{%- for message in loop_messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = render_content(message.content) %}\n {%- endif %}\n {%- if message.role == \"assistant\" %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- set reasoning_content = '' %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].split('<think>')[-1] %}\n {%- set content = content.split('</think>')[-1] %}\n {%- endif %}\n {%- endif %}\n {%- if (keep_all_reasoning or loop.index0 > ns.last_user_index) and reasoning_content -%}\n 
{{- '<|im_start|>' + message.role + '\\n<think>' + reasoning_content + '</think>' + content }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n<think></think>' + content }}\n {%- endif %}\n {%- if message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n {%- if tool_call.arguments is defined %}\n {%- for args_name, args_value in tool_call.arguments|items %}\n {{- '<parameter=' + args_name + '>' }}\n {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}\n {{- args_value }}\n {{- '</parameter>\\n' }}\n {%- endfor %}\n {%- endif %}\n {{- '</function>\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>' }}\n {%- elif message.role == \"user\" %}\n {{- '<|im_start|>' + message.role + '\\n' + render_content(message.content) + '<|im_end|>' }}\n {%- elif message.role == \"system\" %}\n {{- '<|im_start|>' + message.role + '\\n' + render_content(message.content) + '<|im_end|>' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.previtem and loop.previtem.role != \"tool\" %}\n {{- '<|im_start|>tool\\n' }}\n {%- endif %}\n {{- '<tool_response>\\n' }}\n {{- render_content(message.content) }}\n {{- '\\n</tool_response>\\n' }}\n {%- if not loop.last and loop.nextitem.role != \"tool\" %}\n {{- '<|im_end|>' }}\n {%- elif loop.last %}\n {{- '<|im_end|>' }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + render_content(message.content) + '<|im_end|>' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if not enable_thinking -%}\n {{- '<think></think>' -}}\n {%- else -%}\n {{- '' -}}\n {%- endif -%}\n{%- endif %}\n",
|
| 231 |
+
"clean_up_tokenization_spaces": false,
|
| 232 |
+
"eos_token": "<|im_end|>",
|
| 233 |
+
"errors": "replace",
|
| 234 |
+
"extra_special_tokens": {},
|
| 235 |
+
"model_max_length": 131272,
|
| 236 |
+
"pad_token": "<|endoftext|>",
|
| 237 |
+
"split_special_tokens": false,
|
| 238 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 239 |
+
"unk_token": null
|
| 240 |
+
}
|
vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|