cpatonn committed on
Commit
fd002a9
·
verified ·
1 Parent(s): cc911d1

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. config.json +60 -0
config.json CHANGED
@@ -111,6 +111,7 @@
111
  "model.layers.0.linear_attn.in_proj_ba",
112
  "model.layers.0.linear_attn.out_proj",
113
  "model.layers.0.mlp.gate",
 
114
  "model.layers.0.mlp.shared_expert.gate_proj",
115
  "model.layers.0.mlp.shared_expert.up_proj",
116
  "model.layers.0.mlp.shared_expert.down_proj",
@@ -119,6 +120,7 @@
119
  "model.layers.1.linear_attn.in_proj_ba",
120
  "model.layers.1.linear_attn.out_proj",
121
  "model.layers.1.mlp.gate",
 
122
  "model.layers.1.mlp.shared_expert.gate_proj",
123
  "model.layers.1.mlp.shared_expert.up_proj",
124
  "model.layers.1.mlp.shared_expert.down_proj",
@@ -127,15 +129,18 @@
127
  "model.layers.2.linear_attn.in_proj_ba",
128
  "model.layers.2.linear_attn.out_proj",
129
  "model.layers.2.mlp.gate",
 
130
  "model.layers.2.mlp.shared_expert.gate_proj",
131
  "model.layers.2.mlp.shared_expert.up_proj",
132
  "model.layers.2.mlp.shared_expert.down_proj",
133
  "model.layers.2.mlp.shared_expert_gate",
 
134
  "model.layers.3.self_attn.q_proj",
135
  "model.layers.3.self_attn.k_proj",
136
  "model.layers.3.self_attn.v_proj",
137
  "model.layers.3.self_attn.o_proj",
138
  "model.layers.3.mlp.gate",
 
139
  "model.layers.3.mlp.shared_expert.gate_proj",
140
  "model.layers.3.mlp.shared_expert.up_proj",
141
  "model.layers.3.mlp.shared_expert.down_proj",
@@ -144,6 +149,7 @@
144
  "model.layers.4.linear_attn.in_proj_ba",
145
  "model.layers.4.linear_attn.out_proj",
146
  "model.layers.4.mlp.gate",
 
147
  "model.layers.4.mlp.shared_expert.gate_proj",
148
  "model.layers.4.mlp.shared_expert.up_proj",
149
  "model.layers.4.mlp.shared_expert.down_proj",
@@ -152,6 +158,7 @@
152
  "model.layers.5.linear_attn.in_proj_ba",
153
  "model.layers.5.linear_attn.out_proj",
154
  "model.layers.5.mlp.gate",
 
155
  "model.layers.5.mlp.shared_expert.gate_proj",
156
  "model.layers.5.mlp.shared_expert.up_proj",
157
  "model.layers.5.mlp.shared_expert.down_proj",
@@ -160,15 +167,18 @@
160
  "model.layers.6.linear_attn.in_proj_ba",
161
  "model.layers.6.linear_attn.out_proj",
162
  "model.layers.6.mlp.gate",
 
163
  "model.layers.6.mlp.shared_expert.gate_proj",
164
  "model.layers.6.mlp.shared_expert.up_proj",
165
  "model.layers.6.mlp.shared_expert.down_proj",
166
  "model.layers.6.mlp.shared_expert_gate",
 
167
  "model.layers.7.self_attn.q_proj",
168
  "model.layers.7.self_attn.k_proj",
169
  "model.layers.7.self_attn.v_proj",
170
  "model.layers.7.self_attn.o_proj",
171
  "model.layers.7.mlp.gate",
 
172
  "model.layers.7.mlp.shared_expert.gate_proj",
173
  "model.layers.7.mlp.shared_expert.up_proj",
174
  "model.layers.7.mlp.shared_expert.down_proj",
@@ -177,6 +187,7 @@
177
  "model.layers.8.linear_attn.in_proj_ba",
178
  "model.layers.8.linear_attn.out_proj",
179
  "model.layers.8.mlp.gate",
 
180
  "model.layers.8.mlp.shared_expert.gate_proj",
181
  "model.layers.8.mlp.shared_expert.up_proj",
182
  "model.layers.8.mlp.shared_expert.down_proj",
@@ -185,6 +196,7 @@
185
  "model.layers.9.linear_attn.in_proj_ba",
186
  "model.layers.9.linear_attn.out_proj",
187
  "model.layers.9.mlp.gate",
 
188
  "model.layers.9.mlp.shared_expert.gate_proj",
189
  "model.layers.9.mlp.shared_expert.up_proj",
190
  "model.layers.9.mlp.shared_expert.down_proj",
@@ -193,15 +205,18 @@
193
  "model.layers.10.linear_attn.in_proj_ba",
194
  "model.layers.10.linear_attn.out_proj",
195
  "model.layers.10.mlp.gate",
 
196
  "model.layers.10.mlp.shared_expert.gate_proj",
197
  "model.layers.10.mlp.shared_expert.up_proj",
198
  "model.layers.10.mlp.shared_expert.down_proj",
199
  "model.layers.10.mlp.shared_expert_gate",
 
200
  "model.layers.11.self_attn.q_proj",
201
  "model.layers.11.self_attn.k_proj",
202
  "model.layers.11.self_attn.v_proj",
203
  "model.layers.11.self_attn.o_proj",
204
  "model.layers.11.mlp.gate",
 
205
  "model.layers.11.mlp.shared_expert.gate_proj",
206
  "model.layers.11.mlp.shared_expert.up_proj",
207
  "model.layers.11.mlp.shared_expert.down_proj",
@@ -210,6 +225,7 @@
210
  "model.layers.12.linear_attn.in_proj_ba",
211
  "model.layers.12.linear_attn.out_proj",
212
  "model.layers.12.mlp.gate",
 
213
  "model.layers.12.mlp.shared_expert.gate_proj",
214
  "model.layers.12.mlp.shared_expert.up_proj",
215
  "model.layers.12.mlp.shared_expert.down_proj",
@@ -218,6 +234,7 @@
218
  "model.layers.13.linear_attn.in_proj_ba",
219
  "model.layers.13.linear_attn.out_proj",
220
  "model.layers.13.mlp.gate",
 
221
  "model.layers.13.mlp.shared_expert.gate_proj",
222
  "model.layers.13.mlp.shared_expert.up_proj",
223
  "model.layers.13.mlp.shared_expert.down_proj",
@@ -226,15 +243,18 @@
226
  "model.layers.14.linear_attn.in_proj_ba",
227
  "model.layers.14.linear_attn.out_proj",
228
  "model.layers.14.mlp.gate",
 
229
  "model.layers.14.mlp.shared_expert.gate_proj",
230
  "model.layers.14.mlp.shared_expert.up_proj",
231
  "model.layers.14.mlp.shared_expert.down_proj",
232
  "model.layers.14.mlp.shared_expert_gate",
 
233
  "model.layers.15.self_attn.q_proj",
234
  "model.layers.15.self_attn.k_proj",
235
  "model.layers.15.self_attn.v_proj",
236
  "model.layers.15.self_attn.o_proj",
237
  "model.layers.15.mlp.gate",
 
238
  "model.layers.15.mlp.shared_expert.gate_proj",
239
  "model.layers.15.mlp.shared_expert.up_proj",
240
  "model.layers.15.mlp.shared_expert.down_proj",
@@ -243,6 +263,7 @@
243
  "model.layers.16.linear_attn.in_proj_ba",
244
  "model.layers.16.linear_attn.out_proj",
245
  "model.layers.16.mlp.gate",
 
246
  "model.layers.16.mlp.shared_expert.gate_proj",
247
  "model.layers.16.mlp.shared_expert.up_proj",
248
  "model.layers.16.mlp.shared_expert.down_proj",
@@ -251,6 +272,7 @@
251
  "model.layers.17.linear_attn.in_proj_ba",
252
  "model.layers.17.linear_attn.out_proj",
253
  "model.layers.17.mlp.gate",
 
254
  "model.layers.17.mlp.shared_expert.gate_proj",
255
  "model.layers.17.mlp.shared_expert.up_proj",
256
  "model.layers.17.mlp.shared_expert.down_proj",
@@ -259,15 +281,18 @@
259
  "model.layers.18.linear_attn.in_proj_ba",
260
  "model.layers.18.linear_attn.out_proj",
261
  "model.layers.18.mlp.gate",
 
262
  "model.layers.18.mlp.shared_expert.gate_proj",
263
  "model.layers.18.mlp.shared_expert.up_proj",
264
  "model.layers.18.mlp.shared_expert.down_proj",
265
  "model.layers.18.mlp.shared_expert_gate",
 
266
  "model.layers.19.self_attn.q_proj",
267
  "model.layers.19.self_attn.k_proj",
268
  "model.layers.19.self_attn.v_proj",
269
  "model.layers.19.self_attn.o_proj",
270
  "model.layers.19.mlp.gate",
 
271
  "model.layers.19.mlp.shared_expert.gate_proj",
272
  "model.layers.19.mlp.shared_expert.up_proj",
273
  "model.layers.19.mlp.shared_expert.down_proj",
@@ -276,6 +301,7 @@
276
  "model.layers.20.linear_attn.in_proj_ba",
277
  "model.layers.20.linear_attn.out_proj",
278
  "model.layers.20.mlp.gate",
 
279
  "model.layers.20.mlp.shared_expert.gate_proj",
280
  "model.layers.20.mlp.shared_expert.up_proj",
281
  "model.layers.20.mlp.shared_expert.down_proj",
@@ -284,6 +310,7 @@
284
  "model.layers.21.linear_attn.in_proj_ba",
285
  "model.layers.21.linear_attn.out_proj",
286
  "model.layers.21.mlp.gate",
 
287
  "model.layers.21.mlp.shared_expert.gate_proj",
288
  "model.layers.21.mlp.shared_expert.up_proj",
289
  "model.layers.21.mlp.shared_expert.down_proj",
@@ -292,15 +319,18 @@
292
  "model.layers.22.linear_attn.in_proj_ba",
293
  "model.layers.22.linear_attn.out_proj",
294
  "model.layers.22.mlp.gate",
 
295
  "model.layers.22.mlp.shared_expert.gate_proj",
296
  "model.layers.22.mlp.shared_expert.up_proj",
297
  "model.layers.22.mlp.shared_expert.down_proj",
298
  "model.layers.22.mlp.shared_expert_gate",
 
299
  "model.layers.23.self_attn.q_proj",
300
  "model.layers.23.self_attn.k_proj",
301
  "model.layers.23.self_attn.v_proj",
302
  "model.layers.23.self_attn.o_proj",
303
  "model.layers.23.mlp.gate",
 
304
  "model.layers.23.mlp.shared_expert.gate_proj",
305
  "model.layers.23.mlp.shared_expert.up_proj",
306
  "model.layers.23.mlp.shared_expert.down_proj",
@@ -309,6 +339,7 @@
309
  "model.layers.24.linear_attn.in_proj_ba",
310
  "model.layers.24.linear_attn.out_proj",
311
  "model.layers.24.mlp.gate",
 
312
  "model.layers.24.mlp.shared_expert.gate_proj",
313
  "model.layers.24.mlp.shared_expert.up_proj",
314
  "model.layers.24.mlp.shared_expert.down_proj",
@@ -317,6 +348,7 @@
317
  "model.layers.25.linear_attn.in_proj_ba",
318
  "model.layers.25.linear_attn.out_proj",
319
  "model.layers.25.mlp.gate",
 
320
  "model.layers.25.mlp.shared_expert.gate_proj",
321
  "model.layers.25.mlp.shared_expert.up_proj",
322
  "model.layers.25.mlp.shared_expert.down_proj",
@@ -325,15 +357,18 @@
325
  "model.layers.26.linear_attn.in_proj_ba",
326
  "model.layers.26.linear_attn.out_proj",
327
  "model.layers.26.mlp.gate",
 
328
  "model.layers.26.mlp.shared_expert.gate_proj",
329
  "model.layers.26.mlp.shared_expert.up_proj",
330
  "model.layers.26.mlp.shared_expert.down_proj",
331
  "model.layers.26.mlp.shared_expert_gate",
 
332
  "model.layers.27.self_attn.q_proj",
333
  "model.layers.27.self_attn.k_proj",
334
  "model.layers.27.self_attn.v_proj",
335
  "model.layers.27.self_attn.o_proj",
336
  "model.layers.27.mlp.gate",
 
337
  "model.layers.27.mlp.shared_expert.gate_proj",
338
  "model.layers.27.mlp.shared_expert.up_proj",
339
  "model.layers.27.mlp.shared_expert.down_proj",
@@ -342,6 +377,7 @@
342
  "model.layers.28.linear_attn.in_proj_ba",
343
  "model.layers.28.linear_attn.out_proj",
344
  "model.layers.28.mlp.gate",
 
345
  "model.layers.28.mlp.shared_expert.gate_proj",
346
  "model.layers.28.mlp.shared_expert.up_proj",
347
  "model.layers.28.mlp.shared_expert.down_proj",
@@ -350,6 +386,7 @@
350
  "model.layers.29.linear_attn.in_proj_ba",
351
  "model.layers.29.linear_attn.out_proj",
352
  "model.layers.29.mlp.gate",
 
353
  "model.layers.29.mlp.shared_expert.gate_proj",
354
  "model.layers.29.mlp.shared_expert.up_proj",
355
  "model.layers.29.mlp.shared_expert.down_proj",
@@ -358,15 +395,18 @@
358
  "model.layers.30.linear_attn.in_proj_ba",
359
  "model.layers.30.linear_attn.out_proj",
360
  "model.layers.30.mlp.gate",
 
361
  "model.layers.30.mlp.shared_expert.gate_proj",
362
  "model.layers.30.mlp.shared_expert.up_proj",
363
  "model.layers.30.mlp.shared_expert.down_proj",
364
  "model.layers.30.mlp.shared_expert_gate",
 
365
  "model.layers.31.self_attn.q_proj",
366
  "model.layers.31.self_attn.k_proj",
367
  "model.layers.31.self_attn.v_proj",
368
  "model.layers.31.self_attn.o_proj",
369
  "model.layers.31.mlp.gate",
 
370
  "model.layers.31.mlp.shared_expert.gate_proj",
371
  "model.layers.31.mlp.shared_expert.up_proj",
372
  "model.layers.31.mlp.shared_expert.down_proj",
@@ -375,6 +415,7 @@
375
  "model.layers.32.linear_attn.in_proj_ba",
376
  "model.layers.32.linear_attn.out_proj",
377
  "model.layers.32.mlp.gate",
 
378
  "model.layers.32.mlp.shared_expert.gate_proj",
379
  "model.layers.32.mlp.shared_expert.up_proj",
380
  "model.layers.32.mlp.shared_expert.down_proj",
@@ -383,6 +424,7 @@
383
  "model.layers.33.linear_attn.in_proj_ba",
384
  "model.layers.33.linear_attn.out_proj",
385
  "model.layers.33.mlp.gate",
 
386
  "model.layers.33.mlp.shared_expert.gate_proj",
387
  "model.layers.33.mlp.shared_expert.up_proj",
388
  "model.layers.33.mlp.shared_expert.down_proj",
@@ -391,15 +433,18 @@
391
  "model.layers.34.linear_attn.in_proj_ba",
392
  "model.layers.34.linear_attn.out_proj",
393
  "model.layers.34.mlp.gate",
 
394
  "model.layers.34.mlp.shared_expert.gate_proj",
395
  "model.layers.34.mlp.shared_expert.up_proj",
396
  "model.layers.34.mlp.shared_expert.down_proj",
397
  "model.layers.34.mlp.shared_expert_gate",
 
398
  "model.layers.35.self_attn.q_proj",
399
  "model.layers.35.self_attn.k_proj",
400
  "model.layers.35.self_attn.v_proj",
401
  "model.layers.35.self_attn.o_proj",
402
  "model.layers.35.mlp.gate",
 
403
  "model.layers.35.mlp.shared_expert.gate_proj",
404
  "model.layers.35.mlp.shared_expert.up_proj",
405
  "model.layers.35.mlp.shared_expert.down_proj",
@@ -408,6 +453,7 @@
408
  "model.layers.36.linear_attn.in_proj_ba",
409
  "model.layers.36.linear_attn.out_proj",
410
  "model.layers.36.mlp.gate",
 
411
  "model.layers.36.mlp.shared_expert.gate_proj",
412
  "model.layers.36.mlp.shared_expert.up_proj",
413
  "model.layers.36.mlp.shared_expert.down_proj",
@@ -416,6 +462,7 @@
416
  "model.layers.37.linear_attn.in_proj_ba",
417
  "model.layers.37.linear_attn.out_proj",
418
  "model.layers.37.mlp.gate",
 
419
  "model.layers.37.mlp.shared_expert.gate_proj",
420
  "model.layers.37.mlp.shared_expert.up_proj",
421
  "model.layers.37.mlp.shared_expert.down_proj",
@@ -424,15 +471,18 @@
424
  "model.layers.38.linear_attn.in_proj_ba",
425
  "model.layers.38.linear_attn.out_proj",
426
  "model.layers.38.mlp.gate",
 
427
  "model.layers.38.mlp.shared_expert.gate_proj",
428
  "model.layers.38.mlp.shared_expert.up_proj",
429
  "model.layers.38.mlp.shared_expert.down_proj",
430
  "model.layers.38.mlp.shared_expert_gate",
 
431
  "model.layers.39.self_attn.q_proj",
432
  "model.layers.39.self_attn.k_proj",
433
  "model.layers.39.self_attn.v_proj",
434
  "model.layers.39.self_attn.o_proj",
435
  "model.layers.39.mlp.gate",
 
436
  "model.layers.39.mlp.shared_expert.gate_proj",
437
  "model.layers.39.mlp.shared_expert.up_proj",
438
  "model.layers.39.mlp.shared_expert.down_proj",
@@ -441,6 +491,7 @@
441
  "model.layers.40.linear_attn.in_proj_ba",
442
  "model.layers.40.linear_attn.out_proj",
443
  "model.layers.40.mlp.gate",
 
444
  "model.layers.40.mlp.shared_expert.gate_proj",
445
  "model.layers.40.mlp.shared_expert.up_proj",
446
  "model.layers.40.mlp.shared_expert.down_proj",
@@ -449,6 +500,7 @@
449
  "model.layers.41.linear_attn.in_proj_ba",
450
  "model.layers.41.linear_attn.out_proj",
451
  "model.layers.41.mlp.gate",
 
452
  "model.layers.41.mlp.shared_expert.gate_proj",
453
  "model.layers.41.mlp.shared_expert.up_proj",
454
  "model.layers.41.mlp.shared_expert.down_proj",
@@ -457,15 +509,18 @@
457
  "model.layers.42.linear_attn.in_proj_ba",
458
  "model.layers.42.linear_attn.out_proj",
459
  "model.layers.42.mlp.gate",
 
460
  "model.layers.42.mlp.shared_expert.gate_proj",
461
  "model.layers.42.mlp.shared_expert.up_proj",
462
  "model.layers.42.mlp.shared_expert.down_proj",
463
  "model.layers.42.mlp.shared_expert_gate",
 
464
  "model.layers.43.self_attn.q_proj",
465
  "model.layers.43.self_attn.k_proj",
466
  "model.layers.43.self_attn.v_proj",
467
  "model.layers.43.self_attn.o_proj",
468
  "model.layers.43.mlp.gate",
 
469
  "model.layers.43.mlp.shared_expert.gate_proj",
470
  "model.layers.43.mlp.shared_expert.up_proj",
471
  "model.layers.43.mlp.shared_expert.down_proj",
@@ -474,6 +529,7 @@
474
  "model.layers.44.linear_attn.in_proj_ba",
475
  "model.layers.44.linear_attn.out_proj",
476
  "model.layers.44.mlp.gate",
 
477
  "model.layers.44.mlp.shared_expert.gate_proj",
478
  "model.layers.44.mlp.shared_expert.up_proj",
479
  "model.layers.44.mlp.shared_expert.down_proj",
@@ -482,6 +538,7 @@
482
  "model.layers.45.linear_attn.in_proj_ba",
483
  "model.layers.45.linear_attn.out_proj",
484
  "model.layers.45.mlp.gate",
 
485
  "model.layers.45.mlp.shared_expert.gate_proj",
486
  "model.layers.45.mlp.shared_expert.up_proj",
487
  "model.layers.45.mlp.shared_expert.down_proj",
@@ -490,15 +547,18 @@
490
  "model.layers.46.linear_attn.in_proj_ba",
491
  "model.layers.46.linear_attn.out_proj",
492
  "model.layers.46.mlp.gate",
 
493
  "model.layers.46.mlp.shared_expert.gate_proj",
494
  "model.layers.46.mlp.shared_expert.up_proj",
495
  "model.layers.46.mlp.shared_expert.down_proj",
496
  "model.layers.46.mlp.shared_expert_gate",
 
497
  "model.layers.47.self_attn.q_proj",
498
  "model.layers.47.self_attn.k_proj",
499
  "model.layers.47.self_attn.v_proj",
500
  "model.layers.47.self_attn.o_proj",
501
  "model.layers.47.mlp.gate",
 
502
  "model.layers.47.mlp.shared_expert.gate_proj",
503
  "model.layers.47.mlp.shared_expert.up_proj",
504
  "model.layers.47.mlp.shared_expert.down_proj",
 
111
  "model.layers.0.linear_attn.in_proj_ba",
112
  "model.layers.0.linear_attn.out_proj",
113
  "model.layers.0.mlp.gate",
114
+ "model.layers.0.mlp.shared_expert.gate_up_proj",
115
  "model.layers.0.mlp.shared_expert.gate_proj",
116
  "model.layers.0.mlp.shared_expert.up_proj",
117
  "model.layers.0.mlp.shared_expert.down_proj",
 
120
  "model.layers.1.linear_attn.in_proj_ba",
121
  "model.layers.1.linear_attn.out_proj",
122
  "model.layers.1.mlp.gate",
123
+ "model.layers.1.mlp.shared_expert.gate_up_proj",
124
  "model.layers.1.mlp.shared_expert.gate_proj",
125
  "model.layers.1.mlp.shared_expert.up_proj",
126
  "model.layers.1.mlp.shared_expert.down_proj",
 
129
  "model.layers.2.linear_attn.in_proj_ba",
130
  "model.layers.2.linear_attn.out_proj",
131
  "model.layers.2.mlp.gate",
132
+ "model.layers.2.mlp.shared_expert.gate_up_proj",
133
  "model.layers.2.mlp.shared_expert.gate_proj",
134
  "model.layers.2.mlp.shared_expert.up_proj",
135
  "model.layers.2.mlp.shared_expert.down_proj",
136
  "model.layers.2.mlp.shared_expert_gate",
137
+ "model.layers.3.self_attn.qkv_proj",
138
  "model.layers.3.self_attn.q_proj",
139
  "model.layers.3.self_attn.k_proj",
140
  "model.layers.3.self_attn.v_proj",
141
  "model.layers.3.self_attn.o_proj",
142
  "model.layers.3.mlp.gate",
143
+ "model.layers.3.mlp.shared_expert.gate_up_proj",
144
  "model.layers.3.mlp.shared_expert.gate_proj",
145
  "model.layers.3.mlp.shared_expert.up_proj",
146
  "model.layers.3.mlp.shared_expert.down_proj",
 
149
  "model.layers.4.linear_attn.in_proj_ba",
150
  "model.layers.4.linear_attn.out_proj",
151
  "model.layers.4.mlp.gate",
152
+ "model.layers.4.mlp.shared_expert.gate_up_proj",
153
  "model.layers.4.mlp.shared_expert.gate_proj",
154
  "model.layers.4.mlp.shared_expert.up_proj",
155
  "model.layers.4.mlp.shared_expert.down_proj",
 
158
  "model.layers.5.linear_attn.in_proj_ba",
159
  "model.layers.5.linear_attn.out_proj",
160
  "model.layers.5.mlp.gate",
161
+ "model.layers.5.mlp.shared_expert.gate_up_proj",
162
  "model.layers.5.mlp.shared_expert.gate_proj",
163
  "model.layers.5.mlp.shared_expert.up_proj",
164
  "model.layers.5.mlp.shared_expert.down_proj",
 
167
  "model.layers.6.linear_attn.in_proj_ba",
168
  "model.layers.6.linear_attn.out_proj",
169
  "model.layers.6.mlp.gate",
170
+ "model.layers.6.mlp.shared_expert.gate_up_proj",
171
  "model.layers.6.mlp.shared_expert.gate_proj",
172
  "model.layers.6.mlp.shared_expert.up_proj",
173
  "model.layers.6.mlp.shared_expert.down_proj",
174
  "model.layers.6.mlp.shared_expert_gate",
175
+ "model.layers.7.self_attn.qkv_proj",
176
  "model.layers.7.self_attn.q_proj",
177
  "model.layers.7.self_attn.k_proj",
178
  "model.layers.7.self_attn.v_proj",
179
  "model.layers.7.self_attn.o_proj",
180
  "model.layers.7.mlp.gate",
181
+ "model.layers.7.mlp.shared_expert.gate_up_proj",
182
  "model.layers.7.mlp.shared_expert.gate_proj",
183
  "model.layers.7.mlp.shared_expert.up_proj",
184
  "model.layers.7.mlp.shared_expert.down_proj",
 
187
  "model.layers.8.linear_attn.in_proj_ba",
188
  "model.layers.8.linear_attn.out_proj",
189
  "model.layers.8.mlp.gate",
190
+ "model.layers.8.mlp.shared_expert.gate_up_proj",
191
  "model.layers.8.mlp.shared_expert.gate_proj",
192
  "model.layers.8.mlp.shared_expert.up_proj",
193
  "model.layers.8.mlp.shared_expert.down_proj",
 
196
  "model.layers.9.linear_attn.in_proj_ba",
197
  "model.layers.9.linear_attn.out_proj",
198
  "model.layers.9.mlp.gate",
199
+ "model.layers.9.mlp.shared_expert.gate_up_proj",
200
  "model.layers.9.mlp.shared_expert.gate_proj",
201
  "model.layers.9.mlp.shared_expert.up_proj",
202
  "model.layers.9.mlp.shared_expert.down_proj",
 
205
  "model.layers.10.linear_attn.in_proj_ba",
206
  "model.layers.10.linear_attn.out_proj",
207
  "model.layers.10.mlp.gate",
208
+ "model.layers.10.mlp.shared_expert.gate_up_proj",
209
  "model.layers.10.mlp.shared_expert.gate_proj",
210
  "model.layers.10.mlp.shared_expert.up_proj",
211
  "model.layers.10.mlp.shared_expert.down_proj",
212
  "model.layers.10.mlp.shared_expert_gate",
213
+ "model.layers.11.self_attn.qkv_proj",
214
  "model.layers.11.self_attn.q_proj",
215
  "model.layers.11.self_attn.k_proj",
216
  "model.layers.11.self_attn.v_proj",
217
  "model.layers.11.self_attn.o_proj",
218
  "model.layers.11.mlp.gate",
219
+ "model.layers.11.mlp.shared_expert.gate_up_proj",
220
  "model.layers.11.mlp.shared_expert.gate_proj",
221
  "model.layers.11.mlp.shared_expert.up_proj",
222
  "model.layers.11.mlp.shared_expert.down_proj",
 
225
  "model.layers.12.linear_attn.in_proj_ba",
226
  "model.layers.12.linear_attn.out_proj",
227
  "model.layers.12.mlp.gate",
228
+ "model.layers.12.mlp.shared_expert.gate_up_proj",
229
  "model.layers.12.mlp.shared_expert.gate_proj",
230
  "model.layers.12.mlp.shared_expert.up_proj",
231
  "model.layers.12.mlp.shared_expert.down_proj",
 
234
  "model.layers.13.linear_attn.in_proj_ba",
235
  "model.layers.13.linear_attn.out_proj",
236
  "model.layers.13.mlp.gate",
237
+ "model.layers.13.mlp.shared_expert.gate_up_proj",
238
  "model.layers.13.mlp.shared_expert.gate_proj",
239
  "model.layers.13.mlp.shared_expert.up_proj",
240
  "model.layers.13.mlp.shared_expert.down_proj",
 
243
  "model.layers.14.linear_attn.in_proj_ba",
244
  "model.layers.14.linear_attn.out_proj",
245
  "model.layers.14.mlp.gate",
246
+ "model.layers.14.mlp.shared_expert.gate_up_proj",
247
  "model.layers.14.mlp.shared_expert.gate_proj",
248
  "model.layers.14.mlp.shared_expert.up_proj",
249
  "model.layers.14.mlp.shared_expert.down_proj",
250
  "model.layers.14.mlp.shared_expert_gate",
251
+ "model.layers.15.self_attn.qkv_proj",
252
  "model.layers.15.self_attn.q_proj",
253
  "model.layers.15.self_attn.k_proj",
254
  "model.layers.15.self_attn.v_proj",
255
  "model.layers.15.self_attn.o_proj",
256
  "model.layers.15.mlp.gate",
257
+ "model.layers.15.mlp.shared_expert.gate_up_proj",
258
  "model.layers.15.mlp.shared_expert.gate_proj",
259
  "model.layers.15.mlp.shared_expert.up_proj",
260
  "model.layers.15.mlp.shared_expert.down_proj",
 
263
  "model.layers.16.linear_attn.in_proj_ba",
264
  "model.layers.16.linear_attn.out_proj",
265
  "model.layers.16.mlp.gate",
266
+ "model.layers.16.mlp.shared_expert.gate_up_proj",
267
  "model.layers.16.mlp.shared_expert.gate_proj",
268
  "model.layers.16.mlp.shared_expert.up_proj",
269
  "model.layers.16.mlp.shared_expert.down_proj",
 
272
  "model.layers.17.linear_attn.in_proj_ba",
273
  "model.layers.17.linear_attn.out_proj",
274
  "model.layers.17.mlp.gate",
275
+ "model.layers.17.mlp.shared_expert.gate_up_proj",
276
  "model.layers.17.mlp.shared_expert.gate_proj",
277
  "model.layers.17.mlp.shared_expert.up_proj",
278
  "model.layers.17.mlp.shared_expert.down_proj",
 
281
  "model.layers.18.linear_attn.in_proj_ba",
282
  "model.layers.18.linear_attn.out_proj",
283
  "model.layers.18.mlp.gate",
284
+ "model.layers.18.mlp.shared_expert.gate_up_proj",
285
  "model.layers.18.mlp.shared_expert.gate_proj",
286
  "model.layers.18.mlp.shared_expert.up_proj",
287
  "model.layers.18.mlp.shared_expert.down_proj",
288
  "model.layers.18.mlp.shared_expert_gate",
289
+ "model.layers.19.self_attn.qkv_proj",
290
  "model.layers.19.self_attn.q_proj",
291
  "model.layers.19.self_attn.k_proj",
292
  "model.layers.19.self_attn.v_proj",
293
  "model.layers.19.self_attn.o_proj",
294
  "model.layers.19.mlp.gate",
295
+ "model.layers.19.mlp.shared_expert.gate_up_proj",
296
  "model.layers.19.mlp.shared_expert.gate_proj",
297
  "model.layers.19.mlp.shared_expert.up_proj",
298
  "model.layers.19.mlp.shared_expert.down_proj",
 
301
  "model.layers.20.linear_attn.in_proj_ba",
302
  "model.layers.20.linear_attn.out_proj",
303
  "model.layers.20.mlp.gate",
304
+ "model.layers.20.mlp.shared_expert.gate_up_proj",
305
  "model.layers.20.mlp.shared_expert.gate_proj",
306
  "model.layers.20.mlp.shared_expert.up_proj",
307
  "model.layers.20.mlp.shared_expert.down_proj",
 
310
  "model.layers.21.linear_attn.in_proj_ba",
311
  "model.layers.21.linear_attn.out_proj",
312
  "model.layers.21.mlp.gate",
313
+ "model.layers.21.mlp.shared_expert.gate_up_proj",
314
  "model.layers.21.mlp.shared_expert.gate_proj",
315
  "model.layers.21.mlp.shared_expert.up_proj",
316
  "model.layers.21.mlp.shared_expert.down_proj",
 
319
  "model.layers.22.linear_attn.in_proj_ba",
320
  "model.layers.22.linear_attn.out_proj",
321
  "model.layers.22.mlp.gate",
322
+ "model.layers.22.mlp.shared_expert.gate_up_proj",
323
  "model.layers.22.mlp.shared_expert.gate_proj",
324
  "model.layers.22.mlp.shared_expert.up_proj",
325
  "model.layers.22.mlp.shared_expert.down_proj",
326
  "model.layers.22.mlp.shared_expert_gate",
327
+ "model.layers.23.self_attn.qkv_proj",
328
  "model.layers.23.self_attn.q_proj",
329
  "model.layers.23.self_attn.k_proj",
330
  "model.layers.23.self_attn.v_proj",
331
  "model.layers.23.self_attn.o_proj",
332
  "model.layers.23.mlp.gate",
333
+ "model.layers.23.mlp.shared_expert.gate_up_proj",
334
  "model.layers.23.mlp.shared_expert.gate_proj",
335
  "model.layers.23.mlp.shared_expert.up_proj",
336
  "model.layers.23.mlp.shared_expert.down_proj",
 
339
  "model.layers.24.linear_attn.in_proj_ba",
340
  "model.layers.24.linear_attn.out_proj",
341
  "model.layers.24.mlp.gate",
342
+ "model.layers.24.mlp.shared_expert.gate_up_proj",
343
  "model.layers.24.mlp.shared_expert.gate_proj",
344
  "model.layers.24.mlp.shared_expert.up_proj",
345
  "model.layers.24.mlp.shared_expert.down_proj",
 
348
  "model.layers.25.linear_attn.in_proj_ba",
349
  "model.layers.25.linear_attn.out_proj",
350
  "model.layers.25.mlp.gate",
351
+ "model.layers.25.mlp.shared_expert.gate_up_proj",
352
  "model.layers.25.mlp.shared_expert.gate_proj",
353
  "model.layers.25.mlp.shared_expert.up_proj",
354
  "model.layers.25.mlp.shared_expert.down_proj",
 
357
  "model.layers.26.linear_attn.in_proj_ba",
358
  "model.layers.26.linear_attn.out_proj",
359
  "model.layers.26.mlp.gate",
360
+ "model.layers.26.mlp.shared_expert.gate_up_proj",
361
  "model.layers.26.mlp.shared_expert.gate_proj",
362
  "model.layers.26.mlp.shared_expert.up_proj",
363
  "model.layers.26.mlp.shared_expert.down_proj",
364
  "model.layers.26.mlp.shared_expert_gate",
365
+ "model.layers.27.self_attn.qkv_proj",
366
  "model.layers.27.self_attn.q_proj",
367
  "model.layers.27.self_attn.k_proj",
368
  "model.layers.27.self_attn.v_proj",
369
  "model.layers.27.self_attn.o_proj",
370
  "model.layers.27.mlp.gate",
371
+ "model.layers.27.mlp.shared_expert.gate_up_proj",
372
  "model.layers.27.mlp.shared_expert.gate_proj",
373
  "model.layers.27.mlp.shared_expert.up_proj",
374
  "model.layers.27.mlp.shared_expert.down_proj",
 
377
  "model.layers.28.linear_attn.in_proj_ba",
378
  "model.layers.28.linear_attn.out_proj",
379
  "model.layers.28.mlp.gate",
380
+ "model.layers.28.mlp.shared_expert.gate_up_proj",
381
  "model.layers.28.mlp.shared_expert.gate_proj",
382
  "model.layers.28.mlp.shared_expert.up_proj",
383
  "model.layers.28.mlp.shared_expert.down_proj",
 
386
  "model.layers.29.linear_attn.in_proj_ba",
387
  "model.layers.29.linear_attn.out_proj",
388
  "model.layers.29.mlp.gate",
389
+ "model.layers.29.mlp.shared_expert.gate_up_proj",
390
  "model.layers.29.mlp.shared_expert.gate_proj",
391
  "model.layers.29.mlp.shared_expert.up_proj",
392
  "model.layers.29.mlp.shared_expert.down_proj",
 
395
  "model.layers.30.linear_attn.in_proj_ba",
396
  "model.layers.30.linear_attn.out_proj",
397
  "model.layers.30.mlp.gate",
398
+ "model.layers.30.mlp.shared_expert.gate_up_proj",
399
  "model.layers.30.mlp.shared_expert.gate_proj",
400
  "model.layers.30.mlp.shared_expert.up_proj",
401
  "model.layers.30.mlp.shared_expert.down_proj",
402
  "model.layers.30.mlp.shared_expert_gate",
403
+ "model.layers.31.self_attn.qkv_proj",
404
  "model.layers.31.self_attn.q_proj",
405
  "model.layers.31.self_attn.k_proj",
406
  "model.layers.31.self_attn.v_proj",
407
  "model.layers.31.self_attn.o_proj",
408
  "model.layers.31.mlp.gate",
409
+ "model.layers.31.mlp.shared_expert.gate_up_proj",
410
  "model.layers.31.mlp.shared_expert.gate_proj",
411
  "model.layers.31.mlp.shared_expert.up_proj",
412
  "model.layers.31.mlp.shared_expert.down_proj",
 
415
  "model.layers.32.linear_attn.in_proj_ba",
416
  "model.layers.32.linear_attn.out_proj",
417
  "model.layers.32.mlp.gate",
418
+ "model.layers.32.mlp.shared_expert.gate_up_proj",
419
  "model.layers.32.mlp.shared_expert.gate_proj",
420
  "model.layers.32.mlp.shared_expert.up_proj",
421
  "model.layers.32.mlp.shared_expert.down_proj",
 
424
  "model.layers.33.linear_attn.in_proj_ba",
425
  "model.layers.33.linear_attn.out_proj",
426
  "model.layers.33.mlp.gate",
427
+ "model.layers.33.mlp.shared_expert.gate_up_proj",
428
  "model.layers.33.mlp.shared_expert.gate_proj",
429
  "model.layers.33.mlp.shared_expert.up_proj",
430
  "model.layers.33.mlp.shared_expert.down_proj",
 
433
  "model.layers.34.linear_attn.in_proj_ba",
434
  "model.layers.34.linear_attn.out_proj",
435
  "model.layers.34.mlp.gate",
436
+ "model.layers.34.mlp.shared_expert.gate_up_proj",
437
  "model.layers.34.mlp.shared_expert.gate_proj",
438
  "model.layers.34.mlp.shared_expert.up_proj",
439
  "model.layers.34.mlp.shared_expert.down_proj",
440
  "model.layers.34.mlp.shared_expert_gate",
441
+ "model.layers.35.self_attn.qkv_proj",
442
  "model.layers.35.self_attn.q_proj",
443
  "model.layers.35.self_attn.k_proj",
444
  "model.layers.35.self_attn.v_proj",
445
  "model.layers.35.self_attn.o_proj",
446
  "model.layers.35.mlp.gate",
447
+ "model.layers.35.mlp.shared_expert.gate_up_proj",
448
  "model.layers.35.mlp.shared_expert.gate_proj",
449
  "model.layers.35.mlp.shared_expert.up_proj",
450
  "model.layers.35.mlp.shared_expert.down_proj",
 
453
  "model.layers.36.linear_attn.in_proj_ba",
454
  "model.layers.36.linear_attn.out_proj",
455
  "model.layers.36.mlp.gate",
456
+ "model.layers.36.mlp.shared_expert.gate_up_proj",
457
  "model.layers.36.mlp.shared_expert.gate_proj",
458
  "model.layers.36.mlp.shared_expert.up_proj",
459
  "model.layers.36.mlp.shared_expert.down_proj",
 
462
  "model.layers.37.linear_attn.in_proj_ba",
463
  "model.layers.37.linear_attn.out_proj",
464
  "model.layers.37.mlp.gate",
465
+ "model.layers.37.mlp.shared_expert.gate_up_proj",
466
  "model.layers.37.mlp.shared_expert.gate_proj",
467
  "model.layers.37.mlp.shared_expert.up_proj",
468
  "model.layers.37.mlp.shared_expert.down_proj",
 
471
  "model.layers.38.linear_attn.in_proj_ba",
472
  "model.layers.38.linear_attn.out_proj",
473
  "model.layers.38.mlp.gate",
474
+ "model.layers.38.mlp.shared_expert.gate_up_proj",
475
  "model.layers.38.mlp.shared_expert.gate_proj",
476
  "model.layers.38.mlp.shared_expert.up_proj",
477
  "model.layers.38.mlp.shared_expert.down_proj",
478
  "model.layers.38.mlp.shared_expert_gate",
479
+ "model.layers.39.self_attn.qkv_proj",
480
  "model.layers.39.self_attn.q_proj",
481
  "model.layers.39.self_attn.k_proj",
482
  "model.layers.39.self_attn.v_proj",
483
  "model.layers.39.self_attn.o_proj",
484
  "model.layers.39.mlp.gate",
485
+ "model.layers.39.mlp.shared_expert.gate_up_proj",
486
  "model.layers.39.mlp.shared_expert.gate_proj",
487
  "model.layers.39.mlp.shared_expert.up_proj",
488
  "model.layers.39.mlp.shared_expert.down_proj",
 
491
  "model.layers.40.linear_attn.in_proj_ba",
492
  "model.layers.40.linear_attn.out_proj",
493
  "model.layers.40.mlp.gate",
494
+ "model.layers.40.mlp.shared_expert.gate_up_proj",
495
  "model.layers.40.mlp.shared_expert.gate_proj",
496
  "model.layers.40.mlp.shared_expert.up_proj",
497
  "model.layers.40.mlp.shared_expert.down_proj",
 
500
  "model.layers.41.linear_attn.in_proj_ba",
501
  "model.layers.41.linear_attn.out_proj",
502
  "model.layers.41.mlp.gate",
503
+ "model.layers.41.mlp.shared_expert.gate_up_proj",
504
  "model.layers.41.mlp.shared_expert.gate_proj",
505
  "model.layers.41.mlp.shared_expert.up_proj",
506
  "model.layers.41.mlp.shared_expert.down_proj",
 
509
  "model.layers.42.linear_attn.in_proj_ba",
510
  "model.layers.42.linear_attn.out_proj",
511
  "model.layers.42.mlp.gate",
512
+ "model.layers.42.mlp.shared_expert.gate_up_proj",
513
  "model.layers.42.mlp.shared_expert.gate_proj",
514
  "model.layers.42.mlp.shared_expert.up_proj",
515
  "model.layers.42.mlp.shared_expert.down_proj",
516
  "model.layers.42.mlp.shared_expert_gate",
517
+ "model.layers.43.self_attn.qkv_proj",
518
  "model.layers.43.self_attn.q_proj",
519
  "model.layers.43.self_attn.k_proj",
520
  "model.layers.43.self_attn.v_proj",
521
  "model.layers.43.self_attn.o_proj",
522
  "model.layers.43.mlp.gate",
523
+ "model.layers.43.mlp.shared_expert.gate_up_proj",
524
  "model.layers.43.mlp.shared_expert.gate_proj",
525
  "model.layers.43.mlp.shared_expert.up_proj",
526
  "model.layers.43.mlp.shared_expert.down_proj",
 
529
  "model.layers.44.linear_attn.in_proj_ba",
530
  "model.layers.44.linear_attn.out_proj",
531
  "model.layers.44.mlp.gate",
532
+ "model.layers.44.mlp.shared_expert.gate_up_proj",
533
  "model.layers.44.mlp.shared_expert.gate_proj",
534
  "model.layers.44.mlp.shared_expert.up_proj",
535
  "model.layers.44.mlp.shared_expert.down_proj",
 
538
  "model.layers.45.linear_attn.in_proj_ba",
539
  "model.layers.45.linear_attn.out_proj",
540
  "model.layers.45.mlp.gate",
541
+ "model.layers.45.mlp.shared_expert.gate_up_proj",
542
  "model.layers.45.mlp.shared_expert.gate_proj",
543
  "model.layers.45.mlp.shared_expert.up_proj",
544
  "model.layers.45.mlp.shared_expert.down_proj",
 
547
  "model.layers.46.linear_attn.in_proj_ba",
548
  "model.layers.46.linear_attn.out_proj",
549
  "model.layers.46.mlp.gate",
550
+ "model.layers.46.mlp.shared_expert.gate_up_proj",
551
  "model.layers.46.mlp.shared_expert.gate_proj",
552
  "model.layers.46.mlp.shared_expert.up_proj",
553
  "model.layers.46.mlp.shared_expert.down_proj",
554
  "model.layers.46.mlp.shared_expert_gate",
555
+ "model.layers.47.self_attn.qkv_proj",
556
  "model.layers.47.self_attn.q_proj",
557
  "model.layers.47.self_attn.k_proj",
558
  "model.layers.47.self_attn.v_proj",
559
  "model.layers.47.self_attn.o_proj",
560
  "model.layers.47.mlp.gate",
561
+ "model.layers.47.mlp.shared_expert.gate_up_proj",
562
  "model.layers.47.mlp.shared_expert.gate_proj",
563
  "model.layers.47.mlp.shared_expert.up_proj",
564
  "model.layers.47.mlp.shared_expert.down_proj",