{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 29.06991988346686, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 43.0, "epoch": 0.011653313911143482, "f1_execute": 0.4545454680919647, "f1_repeat": 0.0, "f1_skip": 0.1666666716337204, "grad_norm": 14.25, "learning_rate": 2e-06, "loss": 1.776, "macro_f1": 0.2070707082748413, "num_tokens": 2751.0, "repeat_count": 0.0, "routers_loss": 1.4875637292861938, "skip_count": 3.0, "step": 2, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 43.0, "epoch": 0.023306627822286964, "f1_execute": 0.4000000059604645, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 14.8125, "learning_rate": 6e-06, "loss": 1.8539, "macro_f1": 0.13333334028720856, "num_tokens": 6134.0, "repeat_count": 0.0, "routers_loss": 1.5653417110443115, "skip_count": 0.0, "step": 4, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 43.0, "epoch": 0.03495994173343044, "f1_execute": 0.4651162326335907, "f1_repeat": 0.2222222238779068, "f1_skip": 0.0, "grad_norm": 16.375, "learning_rate": 1e-05, "loss": 1.8917, "macro_f1": 0.22911283373832703, "num_tokens": 8598.0, "repeat_count": 2.0, "routers_loss": 1.5763707160949707, "skip_count": 2.0, "step": 6, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 40.0, "epoch": 0.04661325564457393, "f1_execute": 0.43478262424468994, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 15.0625, "learning_rate": 1.4e-05, "loss": 1.8894, "macro_f1": 0.14492754638195038, "num_tokens": 11162.0, "repeat_count": 0.0, "routers_loss": 1.4726613759994507, "skip_count": 0.0, "step": 8, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 40.0, "epoch": 0.05826656955571741, "f1_execute": 0.46511632204055786, "f1_repeat": 0.0, "f1_skip": 0.4285714626312256, "grad_norm": 14.0, "learning_rate": 1.8e-05, "loss": 1.8167, "macro_f1": 0.2978959381580353, "num_tokens": 14221.0, "repeat_count": 0.0, "routers_loss": 1.1233787536621094, "skip_count": 3.0, "step": 10, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 43.0, "epoch": 0.06991988346686089, "f1_execute": 0.4545454680919647, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 14.75, "learning_rate": 2.2e-05, "loss": 1.7943, "macro_f1": 0.1515151560306549, "num_tokens": 16957.0, "repeat_count": 1.0, "routers_loss": 1.6862455606460571, "skip_count": 2.0, "step": 12, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 40.0, "epoch": 0.08157319737800436, "f1_execute": 0.4680851399898529, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 16.375, "learning_rate": 2.6e-05, "loss": 1.9054, "macro_f1": 0.1560283899307251, "num_tokens": 19562.0, "repeat_count": 0.0, "routers_loss": 2.4034464359283447, "skip_count": 1.0, "step": 14, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 42.0, "epoch": 0.09322651128914786, "f1_execute": 0.5306122303009033, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 14.625, "learning_rate": 3e-05, "loss": 1.6203, "macro_f1": 0.17687074840068817, "num_tokens": 22753.0, "repeat_count": 0.0, "routers_loss": 1.5769450664520264, "skip_count": 1.0, "step": 16, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 41.0, "epoch": 0.10487982520029134, "f1_execute": 0.5306122303009033, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 15.4375, "learning_rate": 3.4000000000000007e-05, "loss": 1.628, "macro_f1": 0.17687074840068817, "num_tokens": 25635.0, "repeat_count": 0.0, "routers_loss": 2.594325542449951, "skip_count": 0.0, "step": 18, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.5, "avg_layers": 40.0, "epoch": 0.11653313911143481, "f1_execute": 0.523809552192688, "f1_repeat": 0.125, "f1_skip": 0.2857142984867096, "grad_norm": 13.5625, "learning_rate": 3.8e-05, "loss": 1.589, "macro_f1": 0.3115079402923584, "num_tokens": 29433.0, "repeat_count": 2.0, "routers_loss": 2.065823793411255, "skip_count": 4.0, "step": 20, "text_loss": 0.0 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 0.3333333432674408, "avg_layers": 41.0, "epoch": 0.1281864530225783, "f1_execute": 0.5581395030021667, "f1_repeat": 0.11764706671237946, "f1_skip": 0.1666666716337204, "grad_norm": 13.5, "learning_rate": 4.2000000000000004e-05, "loss": 1.529, "macro_f1": 0.28081774711608887, "num_tokens": 32139.0, "repeat_count": 3.0, "routers_loss": 1.5501419305801392, "skip_count": 3.0, "step": 22, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 0.13983976693372177, "f1_execute": 0.6122449040412903, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 14.8125, "learning_rate": 4.6e-05, "loss": 1.4591, "macro_f1": 0.20408163964748383, "num_tokens": 34766.0, "repeat_count": 1.0, "routers_loss": 1.839440941810608, "skip_count": 1.0, "step": 24, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 40.0, "epoch": 0.15149308084486526, "f1_execute": 0.5600000023841858, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 14.625, "learning_rate": 5e-05, "loss": 1.4013, "macro_f1": 0.18666666746139526, "num_tokens": 37899.0, "repeat_count": 0.0, "routers_loss": 1.2671116590499878, "skip_count": 0.0, "step": 26, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 0.16314639475600873, "f1_execute": 0.5600000023841858, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 14.125, "learning_rate": 5.4e-05, "loss": 1.338, "macro_f1": 0.18666666746139526, "num_tokens": 40506.0, "repeat_count": 0.0, "routers_loss": 1.0261961221694946, "skip_count": 0.0, "step": 28, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 0.17479970866715222, "f1_execute": 0.5714285969734192, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 15.1875, "learning_rate": 5.800000000000001e-05, "loss": 1.2649, "macro_f1": 0.1904762089252472, "num_tokens": 43670.0, "repeat_count": 1.0, "routers_loss": 0.9930331110954285, "skip_count": 1.0, "step": 30, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 40.0, "epoch": 0.1864530225782957, "f1_execute": 0.6399999856948853, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 13.8125, "learning_rate": 6.2e-05, "loss": 1.1591, "macro_f1": 0.2133333384990692, "num_tokens": 46256.0, "repeat_count": 0.0, "routers_loss": 1.4503824710845947, "skip_count": 2.0, "step": 32, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 41.0, "epoch": 0.19810633648943918, "f1_execute": 0.6792453527450562, "f1_repeat": 0.1538461446762085, "f1_skip": 0.0, "grad_norm": 13.1875, "learning_rate": 6.6e-05, "loss": 0.9382, "macro_f1": 0.2776971757411957, "num_tokens": 48853.0, "repeat_count": 2.0, "routers_loss": 0.9751527905464172, "skip_count": 0.0, "step": 34, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 41.0, "epoch": 0.20975965040058267, "f1_execute": 0.7368420958518982, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 11.375, "learning_rate": 7.000000000000001e-05, "loss": 0.7937, "macro_f1": 0.24561403691768646, "num_tokens": 51384.0, "repeat_count": 0.0, "routers_loss": 0.9556958079338074, "skip_count": 0.0, "step": 36, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 37.0, "epoch": 0.22141296431172613, "f1_execute": 0.8524590730667114, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 10.9375, "learning_rate": 7.4e-05, "loss": 0.5751, "macro_f1": 0.3952641487121582, "num_tokens": 53971.0, "repeat_count": 0.0, "routers_loss": 0.29104703664779663, "skip_count": 2.0, "step": 38, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 33.0, "epoch": 0.23306627822286963, "f1_execute": 0.8196721076965332, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.78125, "learning_rate": 7.8e-05, "loss": 0.4263, "macro_f1": 0.2732240557670593, "num_tokens": 56572.0, "repeat_count": 0.0, "routers_loss": 0.3474027216434479, "skip_count": 2.0, "step": 40, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.24471959213401312, "f1_execute": 0.892307698726654, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.90625, "learning_rate": 8.2e-05, "loss": 0.3828, "macro_f1": 0.2974359095096588, "num_tokens": 59207.0, "repeat_count": 0.0, "routers_loss": 0.18196026980876923, "skip_count": 3.0, "step": 42, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 0.2563729060451566, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.8125, "learning_rate": 8.599999999999999e-05, "loss": 0.2497, "macro_f1": 0.3188405930995941, "num_tokens": 63038.0, "repeat_count": 0.0, "routers_loss": 0.13786140084266663, "skip_count": 0.0, "step": 44, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 0.26802621995630005, "f1_execute": 0.9090908765792847, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.296875, "learning_rate": 8.999999999999999e-05, "loss": 0.1956, "macro_f1": 0.3030303120613098, "num_tokens": 66067.0, "repeat_count": 1.0, "routers_loss": 0.12864550948143005, "skip_count": 0.0, "step": 46, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.27967953386744354, "f1_execute": 0.875, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.625, "learning_rate": 9.400000000000001e-05, "loss": 0.1607, "macro_f1": 0.2916666865348816, "num_tokens": 68365.0, "repeat_count": 4.0, "routers_loss": 0.7522405982017517, "skip_count": 2.0, "step": 48, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.29133284777858703, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.234375, "learning_rate": 9.800000000000001e-05, "loss": 0.1263, "macro_f1": 0.3333333432674408, "num_tokens": 71179.0, "repeat_count": 0.0, "routers_loss": 0.05093024671077728, "skip_count": 0.0, "step": 50, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.3029861616897305, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.89453125, "learning_rate": 0.000102, "loss": 0.1163, "macro_f1": 0.32380953431129456, "num_tokens": 74207.0, "repeat_count": 0.0, "routers_loss": 0.11771819740533829, "skip_count": 2.0, "step": 52, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.314639475600874, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.90625, "learning_rate": 0.000106, "loss": 0.1795, "macro_f1": 0.32380953431129456, "num_tokens": 77410.0, "repeat_count": 0.0, "routers_loss": 0.081239253282547, "skip_count": 2.0, "step": 54, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.32629278951201746, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3359375, "learning_rate": 0.00011, "loss": 0.1263, "macro_f1": 0.32380953431129456, "num_tokens": 81439.0, "repeat_count": 0.0, "routers_loss": 0.11152500659227371, "skip_count": 2.0, "step": 56, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 0.33794610342316095, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.96875, "learning_rate": 0.000114, "loss": 0.1362, "macro_f1": 0.3188405930995941, "num_tokens": 84536.0, "repeat_count": 0.0, "routers_loss": 0.19817721843719482, "skip_count": 2.0, "step": 58, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.34959941733430444, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.046875, "learning_rate": 0.000118, "loss": 0.1149, "macro_f1": 0.3188405930995941, "num_tokens": 87368.0, "repeat_count": 0.0, "routers_loss": 0.17378582060337067, "skip_count": 3.0, "step": 60, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.36125273124544793, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.59375, "learning_rate": 0.000122, "loss": 0.1124, "macro_f1": 0.3333333432674408, "num_tokens": 89832.0, "repeat_count": 0.0, "routers_loss": 0.01823478937149048, "skip_count": 0.0, "step": 62, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.3729060451565914, "f1_execute": 0.9253730773925781, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2421875, "learning_rate": 0.000126, "loss": 0.1127, "macro_f1": 0.30845770239830017, "num_tokens": 92262.0, "repeat_count": 2.0, "routers_loss": 0.5476711988449097, "skip_count": 3.0, "step": 64, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 0.38455935906773486, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.8984375, "learning_rate": 0.00013000000000000002, "loss": 0.0984, "macro_f1": 0.32380953431129456, "num_tokens": 95164.0, "repeat_count": 0.0, "routers_loss": 0.08861488848924637, "skip_count": 1.0, "step": 66, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.39621267297887836, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1875, "learning_rate": 0.000134, "loss": 0.1651, "macro_f1": 0.3333333432674408, "num_tokens": 97979.0, "repeat_count": 0.0, "routers_loss": 0.0153065025806427, "skip_count": 0.0, "step": 68, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.40786598689002185, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5703125, "learning_rate": 0.00013800000000000002, "loss": 0.1186, "macro_f1": 0.32380953431129456, "num_tokens": 101125.0, "repeat_count": 0.0, "routers_loss": 0.06934976577758789, "skip_count": 2.0, "step": 70, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 34.0, "epoch": 0.41951930080116534, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.171875, "learning_rate": 0.00014199999999999998, "loss": 0.0999, "macro_f1": 0.3188405930995941, "num_tokens": 104000.0, "repeat_count": 0.0, "routers_loss": 0.07452902942895889, "skip_count": 1.0, "step": 72, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.43117261471230883, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.703125, "learning_rate": 0.000146, "loss": 0.081, "macro_f1": 0.32380953431129456, "num_tokens": 106761.0, "repeat_count": 0.0, "routers_loss": 0.07339581847190857, "skip_count": 2.0, "step": 74, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.44282592862345227, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2265625, "learning_rate": 0.00015, "loss": 0.0937, "macro_f1": 0.3188405930995941, "num_tokens": 109859.0, "repeat_count": 0.0, "routers_loss": 0.16363973915576935, "skip_count": 3.0, "step": 76, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 0.45447924253459576, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.671875, "learning_rate": 0.000154, "loss": 0.1428, "macro_f1": 0.32863849401474, "num_tokens": 113225.0, "repeat_count": 0.0, "routers_loss": 0.02408621832728386, "skip_count": 0.0, "step": 78, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.46613255644573925, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1484375, "learning_rate": 0.000158, "loss": 0.1132, "macro_f1": 0.3333333432674408, "num_tokens": 116303.0, "repeat_count": 0.0, "routers_loss": 0.01130097359418869, "skip_count": 0.0, "step": 80, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.47778587035688275, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.671875, "learning_rate": 0.000162, "loss": 0.1091, "macro_f1": 0.3188405930995941, "num_tokens": 118814.0, "repeat_count": 1.0, "routers_loss": 0.3908933103084564, "skip_count": 2.0, "step": 82, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.48943918426802624, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.421875, "learning_rate": 0.00016600000000000002, "loss": 0.1054, "macro_f1": 0.3333333432674408, "num_tokens": 121859.0, "repeat_count": 0.0, "routers_loss": 0.011200646869838238, "skip_count": 0.0, "step": 84, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.5010924981791697, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4765625, "learning_rate": 0.00017, "loss": 0.0952, "macro_f1": 0.32380953431129456, "num_tokens": 124444.0, "repeat_count": 0.0, "routers_loss": 0.07185240834951401, "skip_count": 2.0, "step": 86, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.5127458120903132, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.265625, "learning_rate": 0.000174, "loss": 0.0811, "macro_f1": 0.3333333432674408, "num_tokens": 126920.0, "repeat_count": 0.0, "routers_loss": 0.018316682428121567, "skip_count": 0.0, "step": 88, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 0.5243991260014567, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.078125, "learning_rate": 0.000178, "loss": 0.0882, "macro_f1": 0.3137255311012268, "num_tokens": 130279.0, "repeat_count": 0.0, "routers_loss": 0.14082899689674377, "skip_count": 3.0, "step": 90, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 34.0, "epoch": 0.5360524399126001, "f1_execute": 0.9253731369972229, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.0625, "learning_rate": 0.000182, "loss": 0.1096, "macro_f1": 0.30845773220062256, "num_tokens": 133002.0, "repeat_count": 2.0, "routers_loss": 0.21677513420581818, "skip_count": 1.0, "step": 92, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 0.5477057538237436, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3359375, "learning_rate": 0.000186, "loss": 0.1632, "macro_f1": 0.3188405930995941, "num_tokens": 135812.0, "repeat_count": 2.0, "routers_loss": 0.23094841837882996, "skip_count": 0.0, "step": 94, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 0.5593590677348871, "f1_execute": 0.9090908765792847, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4296875, "learning_rate": 0.00019, "loss": 0.1258, "macro_f1": 0.3030303120613098, "num_tokens": 139987.0, "repeat_count": 1.0, "routers_loss": 0.25298792123794556, "skip_count": 4.0, "step": 96, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.5710123816460306, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.609375, "learning_rate": 0.000194, "loss": 0.1559, "macro_f1": 0.32863849401474, "num_tokens": 142660.0, "repeat_count": 0.0, "routers_loss": 0.042755406349897385, "skip_count": 1.0, "step": 98, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 0.5826656955571741, "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.28125, "learning_rate": 0.00019800000000000002, "loss": 0.0846, "macro_f1": 0.2857142984867096, "num_tokens": 145985.0, "repeat_count": 5.0, "routers_loss": 0.4518720805644989, "skip_count": 3.0, "step": 100, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.5943190094683175, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.703125, "learning_rate": 0.000202, "loss": 0.0951, "macro_f1": 0.3333333432674408, "num_tokens": 149070.0, "repeat_count": 0.0, "routers_loss": 0.021079307422041893, "skip_count": 0.0, "step": 102, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.605972323379461, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1640625, "learning_rate": 0.000206, "loss": 0.0948, "macro_f1": 0.3333333432674408, "num_tokens": 151635.0, "repeat_count": 0.0, "routers_loss": 0.01923326589167118, "skip_count": 0.0, "step": 104, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 36.0, "epoch": 0.6176256372906045, "f1_execute": 0.9552239179611206, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 1.40625, "learning_rate": 0.00021, "loss": 0.2616, "macro_f1": 0.48507463932037354, "num_tokens": 154637.0, "repeat_count": 0.0, "routers_loss": 0.08926121890544891, "skip_count": 3.0, "step": 106, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.629278951201748, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.296875, "learning_rate": 0.000214, "loss": 0.0891, "macro_f1": 0.32863849401474, "num_tokens": 157116.0, "repeat_count": 0.0, "routers_loss": 0.05483776330947876, "skip_count": 1.0, "step": 108, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.6409322651128915, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3828125, "learning_rate": 0.000218, "loss": 0.12, "macro_f1": 0.32863849401474, "num_tokens": 159905.0, "repeat_count": 0.0, "routers_loss": 0.04328121244907379, "skip_count": 1.0, "step": 110, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.6525855790240349, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.078125, "learning_rate": 0.000222, "loss": 0.0923, "macro_f1": 0.32863849401474, "num_tokens": 162629.0, "repeat_count": 1.0, "routers_loss": 0.09674539417028427, "skip_count": 0.0, "step": 112, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.6642388929351785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2890625, "learning_rate": 0.00022600000000000002, "loss": 0.0999, "macro_f1": 0.3333333432674408, "num_tokens": 165672.0, "repeat_count": 0.0, "routers_loss": 0.013293042778968811, "skip_count": 0.0, "step": 114, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.6758922068463219, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.46875, "learning_rate": 0.00023, "loss": 0.1143, "macro_f1": 0.3333333432674408, "num_tokens": 168754.0, "repeat_count": 0.0, "routers_loss": 0.013701070100069046, "skip_count": 0.0, "step": 116, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 0.6875455207574654, "f1_execute": 0.9090908765792847, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5859375, "learning_rate": 0.00023400000000000002, "loss": 0.1051, "macro_f1": 0.3030303120613098, "num_tokens": 171372.0, "repeat_count": 2.0, "routers_loss": 0.25785914063453674, "skip_count": 3.0, "step": 118, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 0.6991988346686089, "f1_execute": 0.9253731369972229, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4453125, "learning_rate": 0.00023799999999999998, "loss": 0.0894, "macro_f1": 0.30845773220062256, "num_tokens": 174154.0, "repeat_count": 1.0, "routers_loss": 0.15320169925689697, "skip_count": 3.0, "step": 120, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.7108521485797523, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2265625, "learning_rate": 0.000242, "loss": 0.0996, "macro_f1": 0.32863849401474, "num_tokens": 177178.0, "repeat_count": 0.0, "routers_loss": 0.1595502346754074, "skip_count": 1.0, "step": 122, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.7225054624908959, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.71875, "learning_rate": 0.000246, "loss": 0.1009, "macro_f1": 0.3333333432674408, "num_tokens": 180410.0, "repeat_count": 0.0, "routers_loss": 0.012225675396621227, "skip_count": 0.0, "step": 124, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.7341587764020393, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.8828125, "learning_rate": 0.00025, "loss": 0.0949, "macro_f1": 0.3333333432674408, "num_tokens": 183530.0, "repeat_count": 0.0, "routers_loss": 0.012270545586943626, "skip_count": 0.0, "step": 126, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.7458120903131829, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.015625, "learning_rate": 0.000254, "loss": 0.0913, "macro_f1": 0.32863849401474, "num_tokens": 186232.0, "repeat_count": 1.0, "routers_loss": 0.04337947070598602, "skip_count": 0.0, "step": 128, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 0.7574654042243263, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.59375, "learning_rate": 0.00025800000000000004, "loss": 0.1199, "macro_f1": 0.32863849401474, "num_tokens": 188976.0, "repeat_count": 0.0, "routers_loss": 0.025307081639766693, "skip_count": 0.0, "step": 130, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 0.7691187181354697, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.171875, "learning_rate": 0.000262, "loss": 0.1224, "macro_f1": 0.5507246255874634, "num_tokens": 191795.0, "repeat_count": 0.0, "routers_loss": 0.03186547011137009, "skip_count": 2.0, "step": 132, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 34.0, "epoch": 0.7807720320466133, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.75, "learning_rate": 0.000266, "loss": 0.166, "macro_f1": 0.3137255012989044, "num_tokens": 194552.0, "repeat_count": 1.0, "routers_loss": 0.22868163883686066, "skip_count": 1.0, "step": 134, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.7924253459577567, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.859375, "learning_rate": 0.00027, "loss": 0.1069, "macro_f1": 0.32863849401474, "num_tokens": 197277.0, "repeat_count": 0.0, "routers_loss": 0.06971826404333115, "skip_count": 1.0, "step": 136, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.8040786598689003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.171875, "learning_rate": 0.00027400000000000005, "loss": 0.0903, "macro_f1": 0.3333333432674408, "num_tokens": 199699.0, "repeat_count": 0.0, "routers_loss": 0.012532548978924751, "skip_count": 0.0, "step": 138, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 0.8157319737800437, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4140625, "learning_rate": 0.00027800000000000004, "loss": 0.1176, "macro_f1": 0.32380953431129456, "num_tokens": 202832.0, "repeat_count": 0.0, "routers_loss": 0.050830770283937454, "skip_count": 1.0, "step": 140, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 35.0, "epoch": 0.8273852876911871, "f1_execute": 0.939393937587738, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 1.2109375, "learning_rate": 0.00028199999999999997, "loss": 0.0991, "macro_f1": 0.4464646577835083, "num_tokens": 205869.0, "repeat_count": 1.0, "routers_loss": 0.1383591741323471, "skip_count": 4.0, "step": 142, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.8390386016023307, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1015625, "learning_rate": 0.00028599999999999996, "loss": 0.0635, "macro_f1": 0.3333333432674408, "num_tokens": 208840.0, "repeat_count": 0.0, "routers_loss": 0.020324379205703735, "skip_count": 0.0, "step": 144, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.8506919155134741, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.234375, "learning_rate": 0.00029, "loss": 0.1216, "macro_f1": 0.32863849401474, "num_tokens": 211551.0, "repeat_count": 0.0, "routers_loss": 0.048311132937669754, "skip_count": 1.0, "step": 146, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.8623452294246177, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.109375, "learning_rate": 0.000294, "loss": 0.0986, "macro_f1": 0.32863849401474, "num_tokens": 214961.0, "repeat_count": 0.0, "routers_loss": 0.05390466749668121, "skip_count": 1.0, "step": 148, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.8739985433357611, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3125, "learning_rate": 0.000298, "loss": 0.1522, "macro_f1": 0.3188405930995941, "num_tokens": 217406.0, "repeat_count": 0.0, "routers_loss": 0.11864367127418518, "skip_count": 3.0, "step": 150, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 0.8856518572469045, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.265625, "learning_rate": 0.000302, "loss": 0.107, "macro_f1": 0.32380953431129456, "num_tokens": 221241.0, "repeat_count": 0.0, "routers_loss": 0.11915431916713715, "skip_count": 1.0, "step": 152, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.8973051711580481, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.90625, "learning_rate": 0.000306, "loss": 0.1217, "macro_f1": 0.32863849401474, "num_tokens": 223872.0, "repeat_count": 1.0, "routers_loss": 0.0945093184709549, "skip_count": 0.0, "step": 154, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 0.9089584850691915, "f1_execute": 0.9253731369972229, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.421875, "learning_rate": 0.00031, "loss": 0.1082, "macro_f1": 0.30845773220062256, "num_tokens": 226865.0, "repeat_count": 2.0, "routers_loss": 0.284166544675827, "skip_count": 2.0, "step": 156, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 0.9206117989803351, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3984375, "learning_rate": 0.000314, "loss": 0.1554, "macro_f1": 0.32863849401474, "num_tokens": 229550.0, "repeat_count": 0.0, "routers_loss": 0.022414306178689003, "skip_count": 0.0, "step": 158, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.9322651128914785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.7109375, "learning_rate": 0.00031800000000000003, "loss": 0.1307, "macro_f1": 0.3333333432674408, "num_tokens": 232907.0, "repeat_count": 0.0, "routers_loss": 0.02432156354188919, "skip_count": 0.0, "step": 160, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 0.943918426802622, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.7109375, "learning_rate": 0.000322, "loss": 0.1132, "macro_f1": 0.3188405930995941, "num_tokens": 235465.0, "repeat_count": 1.0, "routers_loss": 0.10865183919668198, "skip_count": 1.0, "step": 162, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.9555717407137655, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.046875, "learning_rate": 0.000326, "loss": 0.0643, "macro_f1": 0.32863849401474, "num_tokens": 238585.0, "repeat_count": 0.0, "routers_loss": 0.07029256969690323, "skip_count": 1.0, "step": 164, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.9672250546249089, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5546875, "learning_rate": 0.00033, "loss": 0.1292, "macro_f1": 0.3333333432674408, "num_tokens": 241381.0, "repeat_count": 0.0, "routers_loss": 0.00908057950437069, "skip_count": 0.0, "step": 166, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 0.9788783685360525, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4296875, "learning_rate": 0.00033400000000000004, "loss": 0.1316, "macro_f1": 0.32863849401474, "num_tokens": 245288.0, "repeat_count": 1.0, "routers_loss": 0.10059425234794617, "skip_count": 0.0, "step": 168, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 0.9905316824471959, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.1953125, "learning_rate": 0.00033800000000000003, "loss": 0.1043, "macro_f1": 0.5507246255874634, "num_tokens": 248732.0, "repeat_count": 0.0, "routers_loss": 0.0744163990020752, "skip_count": 2.0, "step": 170, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 1.0, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.484375, "learning_rate": 0.000342, "loss": 0.1714, "macro_f1": 0.3188405930995941, "num_tokens": 251216.0, "repeat_count": 0.0, "routers_loss": 0.17273971438407898, "skip_count": 2.0, "step": 172, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 1.0116533139111434, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.328125, "learning_rate": 0.000346, "loss": 0.0935, "macro_f1": 0.5507246255874634, "num_tokens": 253944.0, "repeat_count": 0.0, "routers_loss": 0.030687833204865456, "skip_count": 2.0, "step": 174, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 1.0233066278222869, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.8125, "learning_rate": 0.00035, "loss": 0.086, "macro_f1": 0.3188405930995941, "num_tokens": 256440.0, "repeat_count": 0.0, "routers_loss": 0.07829566299915314, "skip_count": 2.0, "step": 176, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 1.0349599417334305, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.2734375, "learning_rate": 0.000354, "loss": 0.0916, "macro_f1": 0.6666666865348816, "num_tokens": 258991.0, "repeat_count": 0.0, "routers_loss": 0.0218886099755764, "skip_count": 1.0, "step": 178, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 33.0, "epoch": 1.046613255644574, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4453125, "learning_rate": 0.000358, "loss": 0.1149, "macro_f1": 0.3137255311012268, "num_tokens": 261870.0, "repeat_count": 0.0, "routers_loss": 0.08019441366195679, "skip_count": 1.0, "step": 180, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.0582665695557174, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.296875, "learning_rate": 0.000362, "loss": 0.1042, "macro_f1": 0.3333333432674408, "num_tokens": 264666.0, "repeat_count": 0.0, "routers_loss": 0.015607878565788269, "skip_count": 0.0, "step": 182, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 1.0699198834668608, "f1_execute": 0.939393937587738, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 1.4453125, "learning_rate": 0.000366, "loss": 0.0782, "macro_f1": 0.47979801893234253, "num_tokens": 267231.0, "repeat_count": 2.0, "routers_loss": 0.24204856157302856, "skip_count": 3.0, "step": 184, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.0815731973780043, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.171875, "learning_rate": 0.00037, "loss": 0.0826, "macro_f1": 0.32863849401474, "num_tokens": 269812.0, "repeat_count": 1.0, "routers_loss": 0.10646804422140121, "skip_count": 0.0, "step": 186, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 1.093226511289148, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.671875, "learning_rate": 0.000374, "loss": 0.1007, "macro_f1": 0.32380953431129456, "num_tokens": 272607.0, "repeat_count": 0.0, "routers_loss": 0.0516185536980629, "skip_count": 1.0, "step": 188, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 1.1048798252002914, "f1_execute": 0.9552239179611206, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 2.09375, "learning_rate": 0.000378, "loss": 0.1037, "macro_f1": 0.48507463932037354, "num_tokens": 274911.0, "repeat_count": 1.0, "routers_loss": 0.1002965122461319, "skip_count": 3.0, "step": 190, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 1.1165331391114348, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.03125, "learning_rate": 0.000382, "loss": 0.0638, "macro_f1": 0.32863849401474, "num_tokens": 278190.0, "repeat_count": 0.0, "routers_loss": 0.01686920039355755, "skip_count": 0.0, "step": 192, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.1281864530225783, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.140625, "learning_rate": 0.000386, "loss": 0.0878, "macro_f1": 0.32863849401474, "num_tokens": 280662.0, "repeat_count": 0.0, "routers_loss": 0.11285223066806793, "skip_count": 1.0, "step": 194, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.1398397669337217, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.9453125, "learning_rate": 0.00039000000000000005, "loss": 0.0592, "macro_f1": 0.3333333432674408, "num_tokens": 283553.0, "repeat_count": 0.0, "routers_loss": 0.006507783196866512, "skip_count": 0.0, "step": 196, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.1514930808448653, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.53125, "learning_rate": 0.00039400000000000004, "loss": 0.1178, "macro_f1": 0.3333333432674408, "num_tokens": 286489.0, "repeat_count": 0.0, "routers_loss": 0.007027088198810816, "skip_count": 0.0, "step": 198, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.1631463947560088, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.25, "learning_rate": 0.000398, "loss": 0.1027, "macro_f1": 0.3188405930995941, "num_tokens": 289119.0, "repeat_count": 1.0, "routers_loss": 0.09960237145423889, "skip_count": 2.0, "step": 200, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 1.1747997086671522, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.40625, "learning_rate": 0.000402, "loss": 0.0869, "macro_f1": 0.5507246255874634, "num_tokens": 291851.0, "repeat_count": 0.0, "routers_loss": 0.029848098754882812, "skip_count": 2.0, "step": 202, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 34.0, "epoch": 1.1864530225782957, "f1_execute": 0.970588207244873, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 1.1171875, "learning_rate": 0.00040600000000000006, "loss": 0.0842, "macro_f1": 0.4901960790157318, "num_tokens": 294500.0, "repeat_count": 0.0, "routers_loss": 0.0354730449616909, "skip_count": 2.0, "step": 204, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.20000000298023224, "avg_layers": 36.0, "epoch": 1.198106336489439, "f1_execute": 0.9230769872665405, "f1_repeat": 0.0, "f1_skip": 0.3333333134651184, "grad_norm": 1.1875, "learning_rate": 0.00041, "loss": 0.0826, "macro_f1": 0.41880345344543457, "num_tokens": 297328.0, "repeat_count": 0.0, "routers_loss": 0.2616942822933197, "skip_count": 5.0, "step": 206, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.2097596504005828, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.609375, "learning_rate": 0.000414, "loss": 0.1513, "macro_f1": 0.32380953431129456, "num_tokens": 301162.0, "repeat_count": 0.0, "routers_loss": 0.47280120849609375, "skip_count": 2.0, "step": 208, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.2214129643117262, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.390625, "learning_rate": 0.00041799999999999997, "loss": 0.0705, "macro_f1": 0.3188405930995941, "num_tokens": 303873.0, "repeat_count": 1.0, "routers_loss": 0.153229758143425, "skip_count": 2.0, "step": 210, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.2330662782228696, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.21875, "learning_rate": 0.000422, "loss": 0.1448, "macro_f1": 0.3333333432674408, "num_tokens": 307653.0, "repeat_count": 0.0, "routers_loss": 0.003971161786466837, "skip_count": 0.0, "step": 212, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.244719592134013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.8046875, "learning_rate": 0.000426, "loss": 0.0793, "macro_f1": 0.3333333432674408, "num_tokens": 310738.0, "repeat_count": 0.0, "routers_loss": 0.013937443494796753, "skip_count": 0.0, "step": 214, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.2563729060451565, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0390625, "learning_rate": 0.00043, "loss": 0.089, "macro_f1": 0.3333333432674408, "num_tokens": 313707.0, "repeat_count": 0.0, "routers_loss": 0.02097656950354576, "skip_count": 0.0, "step": 216, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 1.2680262199563002, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2421875, "learning_rate": 0.00043400000000000003, "loss": 0.1342, "macro_f1": 0.32863849401474, "num_tokens": 316223.0, "repeat_count": 0.0, "routers_loss": 0.011317480355501175, "skip_count": 0.0, "step": 218, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.2796795338674436, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.91796875, "learning_rate": 0.000438, "loss": 0.109, "macro_f1": 0.32863849401474, "num_tokens": 318911.0, "repeat_count": 0.0, "routers_loss": 0.08032534271478653, "skip_count": 1.0, "step": 220, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 1.291332847778587, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.3125, "learning_rate": 0.000442, "loss": 0.1109, "macro_f1": 0.5507246255874634, "num_tokens": 321843.0, "repeat_count": 0.0, "routers_loss": 0.049881525337696075, "skip_count": 2.0, "step": 222, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.3029861616897305, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5390625, "learning_rate": 0.000446, "loss": 0.1304, "macro_f1": 0.3333333432674408, "num_tokens": 324679.0, "repeat_count": 0.0, "routers_loss": 0.010786174796521664, "skip_count": 0.0, "step": 224, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.314639475600874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4453125, "learning_rate": 0.00045000000000000004, "loss": 0.1039, "macro_f1": 0.3333333432674408, "num_tokens": 327623.0, "repeat_count": 0.0, "routers_loss": 0.01675771363079548, "skip_count": 0.0, "step": 226, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 1.3262927895120176, "f1_execute": 0.9253731369972229, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.53125, "learning_rate": 0.00045400000000000003, "loss": 0.0861, "macro_f1": 0.30845773220062256, "num_tokens": 330312.0, "repeat_count": 2.0, "routers_loss": 0.2814704477787018, "skip_count": 2.0, "step": 228, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.337946103423161, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0546875, "learning_rate": 0.000458, "loss": 0.0885, "macro_f1": 0.32380953431129456, "num_tokens": 333437.0, "repeat_count": 0.0, "routers_loss": 0.03737984597682953, "skip_count": 2.0, "step": 230, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.3495994173343044, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.69140625, "learning_rate": 0.000462, "loss": 0.0651, "macro_f1": 0.32863849401474, "num_tokens": 336753.0, "repeat_count": 0.0, "routers_loss": 0.01981016993522644, "skip_count": 1.0, "step": 232, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.3612527312454479, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.71484375, "learning_rate": 0.00046600000000000005, "loss": 0.1104, "macro_f1": 0.3333333432674408, "num_tokens": 339466.0, "repeat_count": 0.0, "routers_loss": 0.012107688002288342, "skip_count": 0.0, "step": 234, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.3729060451565913, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.9921875, "learning_rate": 0.00047, "loss": 0.0951, "macro_f1": 0.3333333432674408, "num_tokens": 342366.0, "repeat_count": 0.0, "routers_loss": 0.0070621841587126255, "skip_count": 0.0, "step": 236, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.384559359067735, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.15625, "learning_rate": 0.000474, "loss": 0.123, "macro_f1": 0.32863849401474, "num_tokens": 345017.0, "repeat_count": 1.0, "routers_loss": 0.1668872833251953, "skip_count": 0.0, "step": 238, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.3962126729788784, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.71875, "learning_rate": 0.00047799999999999996, "loss": 0.1152, "macro_f1": 0.3333333432674408, "num_tokens": 348080.0, "repeat_count": 0.0, "routers_loss": 0.007294898387044668, "skip_count": 0.0, "step": 240, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.4078659868900218, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.53125, "learning_rate": 0.000482, "loss": 0.114, "macro_f1": 0.3333333432674408, "num_tokens": 350944.0, "repeat_count": 0.0, "routers_loss": 0.010465127415955067, "skip_count": 0.0, "step": 242, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.4195193008011653, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.59375, "learning_rate": 0.000486, "loss": 0.1306, "macro_f1": 0.3188405930995941, "num_tokens": 354041.0, "repeat_count": 0.0, "routers_loss": 0.10580423474311829, "skip_count": 1.0, "step": 244, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 1.4311726147123087, "f1_execute": 0.9253731369972229, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4921875, "learning_rate": 0.00049, "loss": 0.1, "macro_f1": 0.30845773220062256, "num_tokens": 357179.0, "repeat_count": 1.0, "routers_loss": 0.2848719656467438, "skip_count": 3.0, "step": 246, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.4428259286234524, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1640625, "learning_rate": 0.000494, "loss": 0.1092, "macro_f1": 0.3333333432674408, "num_tokens": 360035.0, "repeat_count": 0.0, "routers_loss": 0.009623157791793346, "skip_count": 0.0, "step": 248, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.4544792425345958, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3671875, "learning_rate": 0.000498, "loss": 0.0844, "macro_f1": 0.3333333432674408, "num_tokens": 362798.0, "repeat_count": 0.0, "routers_loss": 0.00488561624661088, "skip_count": 0.0, "step": 250, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 1.4661325564457393, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4296875, "learning_rate": 0.0005020000000000001, "loss": 0.096, "macro_f1": 0.32380953431129456, "num_tokens": 365550.0, "repeat_count": 0.0, "routers_loss": 0.0987156555056572, "skip_count": 1.0, "step": 252, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.4777858703568827, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.92578125, "learning_rate": 0.000506, "loss": 0.0807, "macro_f1": 0.32863849401474, "num_tokens": 368762.0, "repeat_count": 1.0, "routers_loss": 0.06290630251169205, "skip_count": 0.0, "step": 254, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 1.4894391842680261, "f1_execute": 0.9090908765792847, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.421875, "learning_rate": 0.00051, "loss": 0.0979, "macro_f1": 0.3030303120613098, "num_tokens": 371893.0, "repeat_count": 1.0, "routers_loss": 0.410902738571167, "skip_count": 4.0, "step": 256, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 1.5010924981791698, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.9765625, "learning_rate": 0.000514, "loss": 0.1763, "macro_f1": 0.3188405930995941, "num_tokens": 374445.0, "repeat_count": 0.0, "routers_loss": 0.12178002297878265, "skip_count": 2.0, "step": 258, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 33.0, "epoch": 1.5127458120903132, "f1_execute": 0.9090909361839294, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.78125, "learning_rate": 0.000518, "loss": 0.1472, "macro_f1": 0.3030303120613098, "num_tokens": 377471.0, "repeat_count": 3.0, "routers_loss": 0.23162205517292023, "skip_count": 0.0, "step": 260, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 1.5243991260014567, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5234375, "learning_rate": 0.000522, "loss": 0.1019, "macro_f1": 0.32863849401474, "num_tokens": 380070.0, "repeat_count": 0.0, "routers_loss": 0.0405697375535965, "skip_count": 0.0, "step": 262, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.5360524399126, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.953125, "learning_rate": 0.000526, "loss": 0.1092, "macro_f1": 0.32863849401474, "num_tokens": 382776.0, "repeat_count": 0.0, "routers_loss": 0.030989207327365875, "skip_count": 1.0, "step": 264, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 1.5477057538237435, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.6171875, "learning_rate": 0.0005300000000000001, "loss": 0.0861, "macro_f1": 0.32380953431129456, "num_tokens": 385579.0, "repeat_count": 1.0, "routers_loss": 0.060782890766859055, "skip_count": 0.0, "step": 266, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.5593590677348872, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1875, "learning_rate": 0.0005340000000000001, "loss": 0.1278, "macro_f1": 0.32863849401474, "num_tokens": 388087.0, "repeat_count": 1.0, "routers_loss": 0.16287492215633392, "skip_count": 0.0, "step": 268, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.5710123816460306, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.6484375, "learning_rate": 0.0005380000000000001, "loss": 0.1122, "macro_f1": 0.3333333432674408, "num_tokens": 391234.0, "repeat_count": 0.0, "routers_loss": 0.012093812227249146, "skip_count": 0.0, "step": 270, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 1.582665695557174, "f1_execute": 0.9552239179611206, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 1.3046875, "learning_rate": 0.0005420000000000001, "loss": 0.091, "macro_f1": 0.6517413258552551, "num_tokens": 394546.0, "repeat_count": 1.0, "routers_loss": 0.10584304481744766, "skip_count": 3.0, "step": 272, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.5943190094683175, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2265625, "learning_rate": 0.000546, "loss": 0.1365, "macro_f1": 0.3333333432674408, "num_tokens": 397494.0, "repeat_count": 0.0, "routers_loss": 0.010192637331783772, "skip_count": 0.0, "step": 274, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.605972323379461, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.640625, "learning_rate": 0.00055, "loss": 0.1489, "macro_f1": 0.32863849401474, "num_tokens": 400124.0, "repeat_count": 0.0, "routers_loss": 0.16083182394504547, "skip_count": 1.0, "step": 276, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 1.6176256372906046, "f1_execute": 0.8666666150093079, "f1_repeat": 0.5, "f1_skip": 0.0, "grad_norm": 1.203125, "learning_rate": 0.000554, "loss": 0.0904, "macro_f1": 0.4555555284023285, "num_tokens": 403304.0, "repeat_count": 1.0, "routers_loss": 0.19354979693889618, "skip_count": 7.0, "step": 278, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.629278951201748, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.78125, "learning_rate": 0.000558, "loss": 0.1238, "macro_f1": 0.3188405930995941, "num_tokens": 405807.0, "repeat_count": 1.0, "routers_loss": 0.14750653505325317, "skip_count": 2.0, "step": 280, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.6409322651128915, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.921875, "learning_rate": 0.0005620000000000001, "loss": 0.0941, "macro_f1": 0.3333333432674408, "num_tokens": 408488.0, "repeat_count": 0.0, "routers_loss": 0.02661188691854477, "skip_count": 0.0, "step": 282, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.652585579024035, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.390625, "learning_rate": 0.000566, "loss": 0.1662, "macro_f1": 0.3188405930995941, "num_tokens": 411130.0, "repeat_count": 0.0, "routers_loss": 0.11957119405269623, "skip_count": 3.0, "step": 284, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 1.6642388929351783, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.9453125, "learning_rate": 0.00057, "loss": 0.1171, "macro_f1": 0.3137255311012268, "num_tokens": 414095.0, "repeat_count": 0.0, "routers_loss": 0.22778701782226562, "skip_count": 3.0, "step": 286, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 1.675892206846322, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.9375, "learning_rate": 0.000574, "loss": 0.0912, "macro_f1": 0.5507246255874634, "num_tokens": 417235.0, "repeat_count": 0.0, "routers_loss": 0.06726544350385666, "skip_count": 2.0, "step": 288, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.6875455207574654, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0703125, "learning_rate": 0.000578, "loss": 0.1117, "macro_f1": 0.3333333432674408, "num_tokens": 420901.0, "repeat_count": 0.0, "routers_loss": 0.005258599296212196, "skip_count": 0.0, "step": 290, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 33.0, "epoch": 1.6991988346686089, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 1.5, "learning_rate": 0.0005819999999999999, "loss": 0.139, "macro_f1": 0.4188034236431122, "num_tokens": 423714.0, "repeat_count": 1.0, "routers_loss": 0.4013141393661499, "skip_count": 3.0, "step": 292, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.7108521485797523, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.390625, "learning_rate": 0.0005859999999999999, "loss": 0.1485, "macro_f1": 0.32863849401474, "num_tokens": 426274.0, "repeat_count": 0.0, "routers_loss": 0.10371402651071548, "skip_count": 1.0, "step": 294, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.7225054624908958, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.03125, "learning_rate": 0.00059, "loss": 0.1474, "macro_f1": 0.32380953431129456, "num_tokens": 429338.0, "repeat_count": 0.0, "routers_loss": 0.11243436485528946, "skip_count": 2.0, "step": 296, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.7341587764020394, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.34375, "learning_rate": 0.000594, "loss": 0.1154, "macro_f1": 0.3333333432674408, "num_tokens": 432023.0, "repeat_count": 0.0, "routers_loss": 0.010331062600016594, "skip_count": 0.0, "step": 298, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.7458120903131829, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.6953125, "learning_rate": 0.000598, "loss": 0.1389, "macro_f1": 0.32863849401474, "num_tokens": 435090.0, "repeat_count": 1.0, "routers_loss": 0.12604473531246185, "skip_count": 0.0, "step": 300, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.7574654042243263, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1875, "learning_rate": 0.000602, "loss": 0.0884, "macro_f1": 0.3333333432674408, "num_tokens": 439303.0, "repeat_count": 0.0, "routers_loss": 0.01306444313377142, "skip_count": 0.0, "step": 302, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 1.7691187181354697, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.703125, "learning_rate": 0.000606, "loss": 0.106, "macro_f1": 0.3137255311012268, "num_tokens": 442281.0, "repeat_count": 1.0, "routers_loss": 0.18563103675842285, "skip_count": 2.0, "step": 304, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.7807720320466132, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.171875, "learning_rate": 0.00061, "loss": 0.0759, "macro_f1": 0.3333333432674408, "num_tokens": 445238.0, "repeat_count": 0.0, "routers_loss": 0.006393928546458483, "skip_count": 0.0, "step": 306, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.7924253459577568, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1953125, "learning_rate": 0.000614, "loss": 0.1088, "macro_f1": 0.32380953431129456, "num_tokens": 447720.0, "repeat_count": 0.0, "routers_loss": 0.253178209066391, "skip_count": 2.0, "step": 308, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.8040786598689003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.375, "learning_rate": 0.0006180000000000001, "loss": 0.1711, "macro_f1": 0.3333333432674408, "num_tokens": 450432.0, "repeat_count": 0.0, "routers_loss": 0.007134648039937019, "skip_count": 0.0, "step": 310, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 1.8157319737800437, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1875, "learning_rate": 0.000622, "loss": 0.1074, "macro_f1": 0.32863849401474, "num_tokens": 453667.0, "repeat_count": 0.0, "routers_loss": 0.018660295754671097, "skip_count": 0.0, "step": 312, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 34.0, "epoch": 1.8273852876911871, "f1_execute": 0.970588207244873, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 2.046875, "learning_rate": 0.000626, "loss": 0.1945, "macro_f1": 0.4901960790157318, "num_tokens": 456156.0, "repeat_count": 0.0, "routers_loss": 0.044440194964408875, "skip_count": 2.0, "step": 314, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.8390386016023306, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.828125, "learning_rate": 0.00063, "loss": 0.1569, "macro_f1": 0.32863849401474, "num_tokens": 459206.0, "repeat_count": 1.0, "routers_loss": 0.07791463285684586, "skip_count": 0.0, "step": 316, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 34.0, "epoch": 1.8506919155134742, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.328125, "learning_rate": 0.000634, "loss": 0.1524, "macro_f1": 0.32380953431129456, "num_tokens": 462112.0, "repeat_count": 0.0, "routers_loss": 0.08156055212020874, "skip_count": 0.0, "step": 318, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 34.0, "epoch": 1.8623452294246177, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.6875, "learning_rate": 0.000638, "loss": 0.1699, "macro_f1": 0.32380953431129456, "num_tokens": 465863.0, "repeat_count": 0.0, "routers_loss": 0.024970415979623795, "skip_count": 0.0, "step": 320, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 34.0, "epoch": 1.873998543335761, "f1_execute": 0.9253731369972229, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2578125, "learning_rate": 0.000642, "loss": 0.1407, "macro_f1": 0.30845773220062256, "num_tokens": 468735.0, "repeat_count": 0.0, "routers_loss": 0.11067734658718109, "skip_count": 3.0, "step": 322, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.8856518572469045, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.234375, "learning_rate": 0.000646, "loss": 0.1308, "macro_f1": 0.32380953431129456, "num_tokens": 471797.0, "repeat_count": 1.0, "routers_loss": 0.16829438507556915, "skip_count": 1.0, "step": 324, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.897305171158048, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.078125, "learning_rate": 0.0006500000000000001, "loss": 0.1307, "macro_f1": 0.3188405930995941, "num_tokens": 474366.0, "repeat_count": 1.0, "routers_loss": 0.12972968816757202, "skip_count": 2.0, "step": 326, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 1.9089584850691916, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.8515625, "learning_rate": 0.0006540000000000001, "loss": 0.1021, "macro_f1": 0.32863849401474, "num_tokens": 477387.0, "repeat_count": 0.0, "routers_loss": 0.05748067423701286, "skip_count": 0.0, "step": 328, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.920611798980335, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.15625, "learning_rate": 0.0006580000000000001, "loss": 0.1373, "macro_f1": 0.32380953431129456, "num_tokens": 481261.0, "repeat_count": 1.0, "routers_loss": 0.4162489175796509, "skip_count": 1.0, "step": 330, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 1.9322651128914785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.9765625, "learning_rate": 0.000662, "loss": 0.0976, "macro_f1": 0.3333333432674408, "num_tokens": 484865.0, "repeat_count": 0.0, "routers_loss": 0.02795979753136635, "skip_count": 0.0, "step": 332, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 1.943918426802622, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.171875, "learning_rate": 0.000666, "loss": 0.1228, "macro_f1": 0.32863849401474, "num_tokens": 488722.0, "repeat_count": 0.0, "routers_loss": 0.014703518711030483, "skip_count": 0.0, "step": 334, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 1.9555717407137654, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5859375, "learning_rate": 0.00067, "loss": 0.116, "macro_f1": 0.32380953431129456, "num_tokens": 491431.0, "repeat_count": 0.0, "routers_loss": 0.061526842415332794, "skip_count": 1.0, "step": 336, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 1.967225054624909, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.953125, "learning_rate": 0.000674, "loss": 0.0992, "macro_f1": 0.32863849401474, "num_tokens": 494239.0, "repeat_count": 0.0, "routers_loss": 0.018712693825364113, "skip_count": 0.0, "step": 338, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 34.0, "epoch": 1.9788783685360525, "f1_execute": 0.8387096524238586, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5078125, "learning_rate": 0.0006780000000000001, "loss": 0.1403, "macro_f1": 0.2795698940753937, "num_tokens": 496807.0, "repeat_count": 4.0, "routers_loss": 1.0350687503814697, "skip_count": 4.0, "step": 340, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 1.990531682447196, "f1_execute": 0.9253731369972229, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.8125, "learning_rate": 0.0006820000000000001, "loss": 0.1431, "macro_f1": 0.30845773220062256, "num_tokens": 499558.0, "repeat_count": 1.0, "routers_loss": 0.5238631963729858, "skip_count": 3.0, "step": 342, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 34.0, "epoch": 2.0, "f1_execute": 0.9552239179611206, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 1.2109375, "learning_rate": 0.0006860000000000001, "loss": 0.1037, "macro_f1": 0.4517413079738617, "num_tokens": 502432.0, "repeat_count": 0.0, "routers_loss": 0.07342097163200378, "skip_count": 3.0, "step": 344, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.0116533139111437, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.86328125, "learning_rate": 0.00069, "loss": 0.0697, "macro_f1": 0.3333333432674408, "num_tokens": 505288.0, "repeat_count": 0.0, "routers_loss": 0.003549730172380805, "skip_count": 0.0, "step": 346, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 34.0, "epoch": 2.023306627822287, "f1_execute": 0.9552239179611206, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 1.1640625, "learning_rate": 0.000694, "loss": 0.0977, "macro_f1": 0.48507463932037354, "num_tokens": 508123.0, "repeat_count": 1.0, "routers_loss": 0.11794813722372055, "skip_count": 2.0, "step": 348, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 2.0349599417334305, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.1875, "learning_rate": 0.0006979999999999999, "loss": 0.0887, "macro_f1": 0.5507246255874634, "num_tokens": 511648.0, "repeat_count": 0.0, "routers_loss": 0.052432455122470856, "skip_count": 2.0, "step": 350, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.0466132556445737, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.375, "learning_rate": 0.0007019999999999999, "loss": 0.0928, "macro_f1": 0.3188405930995941, "num_tokens": 514367.0, "repeat_count": 0.0, "routers_loss": 0.09224103391170502, "skip_count": 2.0, "step": 352, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 34.0, "epoch": 2.0582665695557174, "f1_execute": 0.970588207244873, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 1.203125, "learning_rate": 0.0007059999999999999, "loss": 0.077, "macro_f1": 0.4901960790157318, "num_tokens": 517676.0, "repeat_count": 0.0, "routers_loss": 0.16971060633659363, "skip_count": 2.0, "step": 354, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.069919883466861, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.34375, "learning_rate": 0.00071, "loss": 0.0954, "macro_f1": 0.32863849401474, "num_tokens": 520440.0, "repeat_count": 0.0, "routers_loss": 0.1069713607430458, "skip_count": 0.0, "step": 356, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 34.0, "epoch": 2.0815731973780043, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.875, "learning_rate": 0.000714, "loss": 0.0648, "macro_f1": 0.3137255012989044, "num_tokens": 523224.0, "repeat_count": 0.0, "routers_loss": 0.2281614989042282, "skip_count": 2.0, "step": 358, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.093226511289148, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.83203125, "learning_rate": 0.000718, "loss": 0.0725, "macro_f1": 0.3333333432674408, "num_tokens": 525969.0, "repeat_count": 0.0, "routers_loss": 0.0074044931679964066, "skip_count": 0.0, "step": 360, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.104879825200291, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.109375, "learning_rate": 0.000722, "loss": 0.0741, "macro_f1": 0.32380953431129456, "num_tokens": 528771.0, "repeat_count": 0.0, "routers_loss": 0.05318116024136543, "skip_count": 2.0, "step": 362, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 2.116533139111435, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 1.0625, "learning_rate": 0.000726, "loss": 0.1083, "macro_f1": 0.4901960790157318, "num_tokens": 531448.0, "repeat_count": 0.0, "routers_loss": 0.10283385217189789, "skip_count": 3.0, "step": 364, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.1281864530225785, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.71875, "learning_rate": 0.00073, "loss": 0.0956, "macro_f1": 0.3188405930995941, "num_tokens": 534124.0, "repeat_count": 0.0, "routers_loss": 0.23495350778102875, "skip_count": 2.0, "step": 366, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.20000000298023224, "avg_layers": 38.0, "epoch": 2.1398397669337217, "f1_execute": 0.9032257795333862, "f1_repeat": 0.5, "f1_skip": 0.3333333134651184, "grad_norm": 1.125, "learning_rate": 0.000734, "loss": 0.0653, "macro_f1": 0.5788530707359314, "num_tokens": 536800.0, "repeat_count": 1.0, "routers_loss": 0.2553648352622986, "skip_count": 5.0, "step": 368, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.1514930808448653, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.6640625, "learning_rate": 0.000738, "loss": 0.1199, "macro_f1": 0.3333333432674408, "num_tokens": 539480.0, "repeat_count": 0.0, "routers_loss": 0.017564471811056137, "skip_count": 0.0, "step": 370, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 2.1631463947560086, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.3515625, "learning_rate": 0.000742, "loss": 0.0785, "macro_f1": 0.5507246255874634, "num_tokens": 542281.0, "repeat_count": 0.0, "routers_loss": 0.0466945543885231, "skip_count": 2.0, "step": 372, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 2.174799708667152, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.9140625, "learning_rate": 0.000746, "loss": 0.0921, "macro_f1": 0.3285024166107178, "num_tokens": 545086.0, "repeat_count": 0.0, "routers_loss": 0.056465521454811096, "skip_count": 2.0, "step": 374, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.186453022578296, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.140625, "learning_rate": 0.00075, "loss": 0.0821, "macro_f1": 0.3188405930995941, "num_tokens": 548115.0, "repeat_count": 0.0, "routers_loss": 0.06476357579231262, "skip_count": 2.0, "step": 376, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.198106336489439, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.15625, "learning_rate": 0.000754, "loss": 0.0807, "macro_f1": 0.32863849401474, "num_tokens": 552759.0, "repeat_count": 0.0, "routers_loss": 0.08218491077423096, "skip_count": 1.0, "step": 378, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.2097596504005828, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.84765625, "learning_rate": 0.000758, "loss": 0.0785, "macro_f1": 0.3333333432674408, "num_tokens": 556082.0, "repeat_count": 0.0, "routers_loss": 0.007066857535392046, "skip_count": 0.0, "step": 380, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.221412964311726, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2109375, "learning_rate": 0.000762, "loss": 0.1064, "macro_f1": 0.32863849401474, "num_tokens": 558743.0, "repeat_count": 0.0, "routers_loss": 0.0755065307021141, "skip_count": 1.0, "step": 382, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.2330662782228696, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.40625, "learning_rate": 0.0007660000000000001, "loss": 0.12, "macro_f1": 0.3333333432674408, "num_tokens": 561184.0, "repeat_count": 0.0, "routers_loss": 0.0050592487677931786, "skip_count": 0.0, "step": 384, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 2.2447195921340133, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 1.1796875, "learning_rate": 0.0007700000000000001, "loss": 0.1174, "macro_f1": 0.4901960790157318, "num_tokens": 563754.0, "repeat_count": 0.0, "routers_loss": 0.10797809064388275, "skip_count": 3.0, "step": 386, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 2.2563729060451565, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.0234375, "learning_rate": 0.0007740000000000001, "loss": 0.1094, "macro_f1": 0.5507246255874634, "num_tokens": 567142.0, "repeat_count": 0.0, "routers_loss": 0.0561814047396183, "skip_count": 2.0, "step": 388, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.2680262199563, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.09375, "learning_rate": 0.000778, "loss": 0.1283, "macro_f1": 0.32863849401474, "num_tokens": 569877.0, "repeat_count": 0.0, "routers_loss": 0.012143422849476337, "skip_count": 0.0, "step": 390, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 33.0, "epoch": 2.2796795338674434, "f1_execute": 0.9090909361839294, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.90234375, "learning_rate": 0.000782, "loss": 0.0737, "macro_f1": 0.3030303120613098, "num_tokens": 573157.0, "repeat_count": 1.0, "routers_loss": 0.09327229857444763, "skip_count": 2.0, "step": 392, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.291332847778587, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2265625, "learning_rate": 0.000786, "loss": 0.0921, "macro_f1": 0.32863849401474, "num_tokens": 575657.0, "repeat_count": 0.0, "routers_loss": 0.729587733745575, "skip_count": 1.0, "step": 394, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.3029861616897307, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.76171875, "learning_rate": 0.00079, "loss": 0.1362, "macro_f1": 0.32380953431129456, "num_tokens": 578615.0, "repeat_count": 0.0, "routers_loss": 0.08575001358985901, "skip_count": 1.0, "step": 396, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.314639475600874, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3515625, "learning_rate": 0.0007940000000000001, "loss": 0.1615, "macro_f1": 0.3137255311012268, "num_tokens": 581484.0, "repeat_count": 1.0, "routers_loss": 0.25073954463005066, "skip_count": 3.0, "step": 398, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.3262927895120176, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.109375, "learning_rate": 0.0007980000000000001, "loss": 0.0666, "macro_f1": 0.32863849401474, "num_tokens": 584449.0, "repeat_count": 1.0, "routers_loss": 0.1658782958984375, "skip_count": 0.0, "step": 400, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.337946103423161, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.25, "learning_rate": 0.0008020000000000001, "loss": 0.071, "macro_f1": 0.3188405930995941, "num_tokens": 588061.0, "repeat_count": 0.0, "routers_loss": 0.22159621119499207, "skip_count": 2.0, "step": 402, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 2.3495994173343044, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.96875, "learning_rate": 0.0008060000000000001, "loss": 0.1115, "macro_f1": 0.3137255012989044, "num_tokens": 590862.0, "repeat_count": 0.0, "routers_loss": 0.21289534866809845, "skip_count": 2.0, "step": 404, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.361252731245448, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2109375, "learning_rate": 0.0008100000000000001, "loss": 0.1285, "macro_f1": 0.32863849401474, "num_tokens": 593429.0, "repeat_count": 1.0, "routers_loss": 0.050796009600162506, "skip_count": 0.0, "step": 406, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 2.3729060451565913, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.73828125, "learning_rate": 0.0008139999999999999, "loss": 0.0812, "macro_f1": 0.32380953431129456, "num_tokens": 596666.0, "repeat_count": 0.0, "routers_loss": 0.05065981671214104, "skip_count": 1.0, "step": 408, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 2.384559359067735, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 1.3671875, "learning_rate": 0.0008179999999999999, "loss": 0.0936, "macro_f1": 0.6666666865348816, "num_tokens": 599209.0, "repeat_count": 1.0, "routers_loss": 0.007604009471833706, "skip_count": 0.0, "step": 410, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.396212672978878, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.98828125, "learning_rate": 0.0008219999999999999, "loss": 0.0853, "macro_f1": 0.3137255311012268, "num_tokens": 602326.0, "repeat_count": 0.0, "routers_loss": 0.11114615947008133, "skip_count": 4.0, "step": 412, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.407865986890022, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4140625, "learning_rate": 0.000826, "loss": 0.1091, "macro_f1": 0.3137255311012268, "num_tokens": 605544.0, "repeat_count": 0.0, "routers_loss": 0.1738194078207016, "skip_count": 4.0, "step": 414, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.4195193008011655, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4453125, "learning_rate": 0.00083, "loss": 0.1067, "macro_f1": 0.3333333432674408, "num_tokens": 608383.0, "repeat_count": 0.0, "routers_loss": 0.008669747039675713, "skip_count": 0.0, "step": 416, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.4311726147123087, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.21875, "learning_rate": 0.000834, "loss": 0.0971, "macro_f1": 0.3333333432674408, "num_tokens": 611066.0, "repeat_count": 0.0, "routers_loss": 0.012438272126019001, "skip_count": 0.0, "step": 418, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 2.4428259286234524, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.2265625, "learning_rate": 0.000838, "loss": 0.0964, "macro_f1": 0.5507246255874634, "num_tokens": 613879.0, "repeat_count": 0.0, "routers_loss": 0.049307335168123245, "skip_count": 2.0, "step": 420, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.4544792425345956, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.9609375, "learning_rate": 0.000842, "loss": 0.0993, "macro_f1": 0.32380953431129456, "num_tokens": 616494.0, "repeat_count": 0.0, "routers_loss": 0.1663498878479004, "skip_count": 2.0, "step": 422, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.4661325564457393, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.359375, "learning_rate": 0.000846, "loss": 0.1305, "macro_f1": 0.32380953431129456, "num_tokens": 619170.0, "repeat_count": 0.0, "routers_loss": 0.025030333548784256, "skip_count": 0.0, "step": 424, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.477785870356883, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.6015625, "learning_rate": 0.00085, "loss": 0.1139, "macro_f1": 0.32380953431129456, "num_tokens": 623050.0, "repeat_count": 0.0, "routers_loss": 0.03259602561593056, "skip_count": 0.0, "step": 426, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 36.0, "epoch": 2.489439184268026, "f1_execute": 0.8709677457809448, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.8984375, "learning_rate": 0.000854, "loss": 0.1288, "macro_f1": 0.4236559271812439, "num_tokens": 626565.0, "repeat_count": 3.0, "routers_loss": 0.3109710216522217, "skip_count": 3.0, "step": 428, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.50109249817917, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5234375, "learning_rate": 0.000858, "loss": 0.1051, "macro_f1": 0.3333333432674408, "num_tokens": 631047.0, "repeat_count": 0.0, "routers_loss": 0.004131393972784281, "skip_count": 0.0, "step": 430, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.512745812090313, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.8515625, "learning_rate": 0.000862, "loss": 0.0735, "macro_f1": 0.32380953431129456, "num_tokens": 633667.0, "repeat_count": 0.0, "routers_loss": 0.051891498267650604, "skip_count": 1.0, "step": 432, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.5243991260014567, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3125, "learning_rate": 0.000866, "loss": 0.0804, "macro_f1": 0.32863849401474, "num_tokens": 636205.0, "repeat_count": 0.0, "routers_loss": 0.016651641577482224, "skip_count": 1.0, "step": 434, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.5360524399126003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.83203125, "learning_rate": 0.00087, "loss": 0.087, "macro_f1": 0.3333333432674408, "num_tokens": 639067.0, "repeat_count": 0.0, "routers_loss": 0.011886201798915863, "skip_count": 0.0, "step": 436, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 2.5477057538237435, "f1_execute": 0.9552239179611206, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 1.1640625, "learning_rate": 0.000874, "loss": 0.0594, "macro_f1": 0.48507463932037354, "num_tokens": 641774.0, "repeat_count": 1.0, "routers_loss": 0.19169247150421143, "skip_count": 3.0, "step": 438, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.559359067734887, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0, "learning_rate": 0.000878, "loss": 0.0779, "macro_f1": 0.32863849401474, "num_tokens": 644556.0, "repeat_count": 0.0, "routers_loss": 0.022916095331311226, "skip_count": 0.0, "step": 440, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.571012381646031, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.765625, "learning_rate": 0.000882, "loss": 0.0843, "macro_f1": 0.32863849401474, "num_tokens": 647197.0, "repeat_count": 0.0, "routers_loss": 0.02645437978208065, "skip_count": 1.0, "step": 442, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.582665695557174, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.75, "learning_rate": 0.0008860000000000001, "loss": 0.1065, "macro_f1": 0.32863849401474, "num_tokens": 650268.0, "repeat_count": 0.0, "routers_loss": 0.008186076767742634, "skip_count": 0.0, "step": 444, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.5943190094683173, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.92578125, "learning_rate": 0.0008900000000000001, "loss": 0.1018, "macro_f1": 0.3188405930995941, "num_tokens": 652911.0, "repeat_count": 0.0, "routers_loss": 0.036992330104112625, "skip_count": 0.0, "step": 446, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.605972323379461, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1640625, "learning_rate": 0.000894, "loss": 0.1314, "macro_f1": 0.3333333432674408, "num_tokens": 656277.0, "repeat_count": 1.0, "routers_loss": 0.045055314898490906, "skip_count": 0.0, "step": 448, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.6176256372906046, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.88671875, "learning_rate": 0.000898, "loss": 0.1466, "macro_f1": 0.32380953431129456, "num_tokens": 659080.0, "repeat_count": 1.0, "routers_loss": 0.1658526211977005, "skip_count": 0.0, "step": 450, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.629278951201748, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0234375, "learning_rate": 0.000902, "loss": 0.1353, "macro_f1": 0.32380953431129456, "num_tokens": 661677.0, "repeat_count": 1.0, "routers_loss": 0.10925920307636261, "skip_count": 1.0, "step": 452, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.6409322651128915, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.203125, "learning_rate": 0.000906, "loss": 0.1125, "macro_f1": 0.32380953431129456, "num_tokens": 664676.0, "repeat_count": 0.0, "routers_loss": 0.17315880954265594, "skip_count": 1.0, "step": 454, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.652585579024035, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.6875, "learning_rate": 0.00091, "loss": 0.0595, "macro_f1": 0.3333333432674408, "num_tokens": 667591.0, "repeat_count": 0.0, "routers_loss": 0.012033844366669655, "skip_count": 0.0, "step": 456, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.6642388929351783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.75390625, "learning_rate": 0.0009140000000000001, "loss": 0.0748, "macro_f1": 0.3333333432674408, "num_tokens": 670372.0, "repeat_count": 0.0, "routers_loss": 0.0032126549631357193, "skip_count": 0.0, "step": 458, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.675892206846322, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.91796875, "learning_rate": 0.0009180000000000001, "loss": 0.149, "macro_f1": 0.32863849401474, "num_tokens": 673436.0, "repeat_count": 0.0, "routers_loss": 0.04437202587723732, "skip_count": 1.0, "step": 460, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.6875455207574657, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.97265625, "learning_rate": 0.0009220000000000001, "loss": 0.1557, "macro_f1": 0.3333333432674408, "num_tokens": 676044.0, "repeat_count": 0.0, "routers_loss": 0.00762246735394001, "skip_count": 0.0, "step": 462, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.699198834668609, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1953125, "learning_rate": 0.0009260000000000001, "loss": 0.1153, "macro_f1": 0.32380953431129456, "num_tokens": 678675.0, "repeat_count": 0.0, "routers_loss": 0.024390628561377525, "skip_count": 0.0, "step": 464, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 2.710852148579752, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.78125, "learning_rate": 0.00093, "loss": 0.1105, "macro_f1": 0.3137255012989044, "num_tokens": 681649.0, "repeat_count": 0.0, "routers_loss": 0.12447867542505264, "skip_count": 2.0, "step": 466, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.7225054624908958, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5859375, "learning_rate": 0.000934, "loss": 0.1362, "macro_f1": 0.32863849401474, "num_tokens": 684241.0, "repeat_count": 0.0, "routers_loss": 0.031407374888658524, "skip_count": 0.0, "step": 468, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.7341587764020394, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0078125, "learning_rate": 0.0009379999999999999, "loss": 0.1174, "macro_f1": 0.32380953431129456, "num_tokens": 687195.0, "repeat_count": 1.0, "routers_loss": 0.06691675633192062, "skip_count": 1.0, "step": 470, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 2.7458120903131826, "f1_execute": 0.9696969985961914, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 1.1640625, "learning_rate": 0.000942, "loss": 0.1512, "macro_f1": 0.5898990035057068, "num_tokens": 690132.0, "repeat_count": 1.0, "routers_loss": 0.14061276614665985, "skip_count": 2.0, "step": 472, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 34.0, "epoch": 2.7574654042243263, "f1_execute": 0.9552239179611206, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 1.1875, "learning_rate": 0.000946, "loss": 0.1186, "macro_f1": 0.48507463932037354, "num_tokens": 693074.0, "repeat_count": 1.0, "routers_loss": 0.18953925371170044, "skip_count": 2.0, "step": 474, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.76911871813547, "f1_execute": 0.9253731369972229, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2421875, "learning_rate": 0.00095, "loss": 0.1149, "macro_f1": 0.30845773220062256, "num_tokens": 696442.0, "repeat_count": 0.0, "routers_loss": 0.10818091034889221, "skip_count": 4.0, "step": 476, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.780772032046613, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4140625, "learning_rate": 0.000954, "loss": 0.1447, "macro_f1": 0.32380953431129456, "num_tokens": 699366.0, "repeat_count": 2.0, "routers_loss": 0.2320510894060135, "skip_count": 0.0, "step": 478, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.792425345957757, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.296875, "learning_rate": 0.000958, "loss": 0.0903, "macro_f1": 0.32863849401474, "num_tokens": 702267.0, "repeat_count": 0.0, "routers_loss": 0.03356971964240074, "skip_count": 1.0, "step": 480, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 2.8040786598689005, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.7578125, "learning_rate": 0.000962, "loss": 0.162, "macro_f1": 0.5507246255874634, "num_tokens": 704972.0, "repeat_count": 0.0, "routers_loss": 0.05412694811820984, "skip_count": 2.0, "step": 482, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.8157319737800437, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.796875, "learning_rate": 0.000966, "loss": 0.1296, "macro_f1": 0.3333333432674408, "num_tokens": 707864.0, "repeat_count": 0.0, "routers_loss": 0.00695835379883647, "skip_count": 0.0, "step": 484, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.827385287691187, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1875, "learning_rate": 0.0009699999999999999, "loss": 0.1381, "macro_f1": 0.3188405930995941, "num_tokens": 710371.0, "repeat_count": 0.0, "routers_loss": 0.1008700579404831, "skip_count": 1.0, "step": 486, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 34.0, "epoch": 2.8390386016023306, "f1_execute": 0.9230769872665405, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 0.98046875, "learning_rate": 0.000974, "loss": 0.0889, "macro_f1": 0.41880345344543457, "num_tokens": 712871.0, "repeat_count": 1.0, "routers_loss": 0.12386820465326309, "skip_count": 4.0, "step": 488, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.8506919155134742, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.8046875, "learning_rate": 0.000978, "loss": 0.128, "macro_f1": 0.3188405930995941, "num_tokens": 715870.0, "repeat_count": 1.0, "routers_loss": 0.09659242630004883, "skip_count": 1.0, "step": 490, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 34.0, "epoch": 2.8623452294246174, "f1_execute": 0.9393939971923828, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 1.15625, "learning_rate": 0.000982, "loss": 0.1164, "macro_f1": 0.42424246668815613, "num_tokens": 718689.0, "repeat_count": 0.0, "routers_loss": 0.2130800038576126, "skip_count": 4.0, "step": 492, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.873998543335761, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.390625, "learning_rate": 0.0009860000000000001, "loss": 0.1549, "macro_f1": 0.32863849401474, "num_tokens": 722291.0, "repeat_count": 0.0, "routers_loss": 0.017502179369330406, "skip_count": 0.0, "step": 494, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.8856518572469048, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.98828125, "learning_rate": 0.00099, "loss": 0.0863, "macro_f1": 0.3333333432674408, "num_tokens": 725263.0, "repeat_count": 0.0, "routers_loss": 0.014174620620906353, "skip_count": 0.0, "step": 496, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.897305171158048, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4375, "learning_rate": 0.000994, "loss": 0.1354, "macro_f1": 0.32863849401474, "num_tokens": 727977.0, "repeat_count": 0.0, "routers_loss": 0.029278423637151718, "skip_count": 0.0, "step": 498, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.9089584850691916, "f1_execute": 0.9090908765792847, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.328125, "learning_rate": 0.000998, "loss": 0.1041, "macro_f1": 0.3030303120613098, "num_tokens": 730815.0, "repeat_count": 1.0, "routers_loss": 0.1902763545513153, "skip_count": 4.0, "step": 500, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.9206117989803353, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0703125, "learning_rate": 0.0009999999623929118, "loss": 0.1244, "macro_f1": 0.3333333432674408, "num_tokens": 733392.0, "repeat_count": 0.0, "routers_loss": 0.01464716624468565, "skip_count": 0.0, "step": 502, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.9322651128914785, "f1_execute": 0.9253730773925781, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1640625, "learning_rate": 0.0009999996615362384, "loss": 0.1086, "macro_f1": 0.30845770239830017, "num_tokens": 736032.0, "repeat_count": 1.0, "routers_loss": 0.2324850857257843, "skip_count": 4.0, "step": 504, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 2.9439184268026217, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.171875, "learning_rate": 0.000999999059823073, "loss": 0.1348, "macro_f1": 0.3188405930995941, "num_tokens": 738808.0, "repeat_count": 0.0, "routers_loss": 0.03815086930990219, "skip_count": 0.0, "step": 506, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 33.0, "epoch": 2.9555717407137654, "f1_execute": 0.9552239179611206, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 1.46875, "learning_rate": 0.0009999981572537777, "loss": 0.1001, "macro_f1": 0.4517413079738617, "num_tokens": 741710.0, "repeat_count": 0.0, "routers_loss": 0.1161150261759758, "skip_count": 2.0, "step": 508, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 34.0, "epoch": 2.967225054624909, "f1_execute": 0.9552239179611206, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 1.7734375, "learning_rate": 0.0009999969538288952, "loss": 0.1376, "macro_f1": 0.4517413079738617, "num_tokens": 744953.0, "repeat_count": 0.0, "routers_loss": 0.30227360129356384, "skip_count": 3.0, "step": 510, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 2.9788783685360523, "f1_execute": 0.9253731369972229, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.125, "learning_rate": 0.00099999544954915, "loss": 0.1349, "macro_f1": 0.30845773220062256, "num_tokens": 748527.0, "repeat_count": 0.0, "routers_loss": 0.37398797273635864, "skip_count": 3.0, "step": 512, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 2.990531682447196, "f1_execute": 0.892307698726654, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.484375, "learning_rate": 0.0009999936444154468, "loss": 0.159, "macro_f1": 0.2974359095096588, "num_tokens": 751260.0, "repeat_count": 6.0, "routers_loss": 0.5735613703727722, "skip_count": 1.0, "step": 514, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3046875, "learning_rate": 0.0009999915384288722, "loss": 0.0995, "macro_f1": 0.3333333432674408, "num_tokens": 753648.0, "repeat_count": 0.0, "routers_loss": 0.007424095645546913, "skip_count": 0.0, "step": 516, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.0116533139111437, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.77734375, "learning_rate": 0.000999989131590693, "loss": 0.0437, "macro_f1": 0.32380953431129456, "num_tokens": 756608.0, "repeat_count": 1.0, "routers_loss": 0.04280600696802139, "skip_count": 0.0, "step": 518, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.800000011920929, "avg_layers": 32.0, "epoch": 3.023306627822287, "f1_execute": 0.9677419066429138, "f1_repeat": 0.0, "f1_skip": 0.888888955116272, "grad_norm": 1.1640625, "learning_rate": 0.000999986423902358, "loss": 0.0809, "macro_f1": 0.6188769340515137, "num_tokens": 760394.0, "repeat_count": 1.0, "routers_loss": 0.07915312051773071, "skip_count": 5.0, "step": 520, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.0349599417334305, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.6953125, "learning_rate": 0.0009999834153654958, "loss": 0.0523, "macro_f1": 0.3333333432674408, "num_tokens": 763350.0, "repeat_count": 0.0, "routers_loss": 0.009005132131278515, "skip_count": 0.0, "step": 522, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.0466132556445737, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4296875, "learning_rate": 0.0009999801059819172, "loss": 0.1002, "macro_f1": 0.32863849401474, "num_tokens": 765570.0, "repeat_count": 0.0, "routers_loss": 0.02528059482574463, "skip_count": 0.0, "step": 524, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.0582665695557174, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.89453125, "learning_rate": 0.0009999764957536132, "loss": 0.0612, "macro_f1": 0.3137255311012268, "num_tokens": 770066.0, "repeat_count": 1.0, "routers_loss": 0.07426194101572037, "skip_count": 2.0, "step": 526, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.069919883466861, "f1_execute": 0.9253731369972229, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1328125, "learning_rate": 0.000999972584682756, "loss": 0.094, "macro_f1": 0.30845773220062256, "num_tokens": 773348.0, "repeat_count": 1.0, "routers_loss": 0.2176111787557602, "skip_count": 3.0, "step": 528, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 35.0, "epoch": 3.0815731973780043, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 1.4296875, "learning_rate": 0.0009999683727716995, "loss": 0.1094, "macro_f1": 0.44102567434310913, "num_tokens": 775837.0, "repeat_count": 2.0, "routers_loss": 0.147159144282341, "skip_count": 4.0, "step": 530, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 33.0, "epoch": 3.093226511289148, "f1_execute": 0.9538460969924927, "f1_repeat": 0.0, "f1_skip": 0.5714285373687744, "grad_norm": 0.92578125, "learning_rate": 0.0009999638600229775, "loss": 0.0769, "macro_f1": 0.5084248781204224, "num_tokens": 778944.0, "repeat_count": 0.0, "routers_loss": 0.08577541261911392, "skip_count": 4.0, "step": 532, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.104879825200291, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0, "learning_rate": 0.0009999590464393057, "loss": 0.0801, "macro_f1": 0.3333333432674408, "num_tokens": 781353.0, "repeat_count": 0.0, "routers_loss": 0.0211736261844635, "skip_count": 0.0, "step": 534, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.116533139111435, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.66015625, "learning_rate": 0.00099995393202358, "loss": 0.0491, "macro_f1": 0.3333333432674408, "num_tokens": 784230.0, "repeat_count": 0.0, "routers_loss": 0.010111500509083271, "skip_count": 0.0, "step": 536, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.1281864530225785, "f1_execute": 0.9552239179611206, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.703125, "learning_rate": 0.0009999485167788789, "loss": 0.048, "macro_f1": 0.6517413258552551, "num_tokens": 787014.0, "repeat_count": 1.0, "routers_loss": 0.0514383539557457, "skip_count": 1.0, "step": 538, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.1398397669337217, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.546875, "learning_rate": 0.0009999428007084596, "loss": 0.0507, "macro_f1": 0.3333333432674408, "num_tokens": 789431.0, "repeat_count": 0.0, "routers_loss": 0.006390048190951347, "skip_count": 0.0, "step": 540, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.1514930808448653, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.796875, "learning_rate": 0.0009999367838157622, "loss": 0.0503, "macro_f1": 0.32863849401474, "num_tokens": 792909.0, "repeat_count": 0.0, "routers_loss": 0.04733205586671829, "skip_count": 1.0, "step": 542, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.1631463947560086, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.203125, "learning_rate": 0.000999930466104407, "loss": 0.1115, "macro_f1": 0.32380953431129456, "num_tokens": 795908.0, "repeat_count": 0.0, "routers_loss": 0.06034643203020096, "skip_count": 2.0, "step": 544, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.174799708667152, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.97265625, "learning_rate": 0.0009999238475781956, "loss": 0.0896, "macro_f1": 0.3333333432674408, "num_tokens": 798778.0, "repeat_count": 0.0, "routers_loss": 0.012119412422180176, "skip_count": 0.0, "step": 546, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.186453022578296, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.93359375, "learning_rate": 0.0009999169282411103, "loss": 0.1001, "macro_f1": 0.32863849401474, "num_tokens": 801877.0, "repeat_count": 0.0, "routers_loss": 0.018893994390964508, "skip_count": 0.0, "step": 548, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.198106336489439, "f1_execute": 0.9253731369972229, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5546875, "learning_rate": 0.0009999097080973145, "loss": 0.0939, "macro_f1": 0.30845773220062256, "num_tokens": 804800.0, "repeat_count": 1.0, "routers_loss": 0.17516465485095978, "skip_count": 2.0, "step": 550, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 3.2097596504005828, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.203125, "learning_rate": 0.0009999021871511526, "loss": 0.1033, "macro_f1": 0.5507246255874634, "num_tokens": 807924.0, "repeat_count": 0.0, "routers_loss": 0.045563552528619766, "skip_count": 2.0, "step": 552, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.221412964311726, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0859375, "learning_rate": 0.0009998943654071504, "loss": 0.0841, "macro_f1": 0.32863849401474, "num_tokens": 810392.0, "repeat_count": 1.0, "routers_loss": 0.012307362630963326, "skip_count": 0.0, "step": 554, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.2330662782228696, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0625, "learning_rate": 0.000999886242870014, "loss": 0.1182, "macro_f1": 0.32863849401474, "num_tokens": 812710.0, "repeat_count": 0.0, "routers_loss": 0.01186416856944561, "skip_count": 0.0, "step": 556, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.2447195921340133, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.9921875, "learning_rate": 0.000999877819544631, "loss": 0.0694, "macro_f1": 0.661835789680481, "num_tokens": 815446.0, "repeat_count": 1.0, "routers_loss": 0.024565091356635094, "skip_count": 0.0, "step": 558, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.2563729060451565, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.9609375, "learning_rate": 0.0009998690954360699, "loss": 0.0922, "macro_f1": 0.32380953431129456, "num_tokens": 818196.0, "repeat_count": 0.0, "routers_loss": 0.131768137216568, "skip_count": 1.0, "step": 560, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.2680262199563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0234375, "learning_rate": 0.00099986007054958, "loss": 0.1028, "macro_f1": 0.3333333432674408, "num_tokens": 821116.0, "repeat_count": 0.0, "routers_loss": 0.005216129124164581, "skip_count": 0.0, "step": 562, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 32.0, "epoch": 3.2796795338674434, "f1_execute": 0.9375, "f1_repeat": 0.0, "f1_skip": 0.5714285373687744, "grad_norm": 0.921875, "learning_rate": 0.0009998507448905915, "loss": 0.0797, "macro_f1": 0.5029761791229248, "num_tokens": 824135.0, "repeat_count": 1.0, "routers_loss": 0.13556139171123505, "skip_count": 3.0, "step": 564, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.291332847778587, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.296875, "learning_rate": 0.0009998411184647163, "loss": 0.0964, "macro_f1": 0.32863849401474, "num_tokens": 827516.0, "repeat_count": 0.0, "routers_loss": 0.032772112637758255, "skip_count": 1.0, "step": 566, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 37.0, "epoch": 3.3029861616897307, "f1_execute": 0.9552239179611206, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.1640625, "learning_rate": 0.0009998311912777463, "loss": 0.0584, "macro_f1": 0.5406302213668823, "num_tokens": 830815.0, "repeat_count": 0.0, "routers_loss": 0.03802331164479256, "skip_count": 2.0, "step": 568, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.314639475600874, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.64453125, "learning_rate": 0.0009998209633356547, "loss": 0.0462, "macro_f1": 0.3188405930995941, "num_tokens": 833883.0, "repeat_count": 0.0, "routers_loss": 0.15256841480731964, "skip_count": 2.0, "step": 570, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.3262927895120176, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.8203125, "learning_rate": 0.0009998104346445964, "loss": 0.0699, "macro_f1": 0.32380953431129456, "num_tokens": 836858.0, "repeat_count": 1.0, "routers_loss": 0.04820551723241806, "skip_count": 0.0, "step": 572, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.337946103423161, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.7890625, "learning_rate": 0.0009997996052109061, "loss": 0.0738, "macro_f1": 0.3137255311012268, "num_tokens": 839982.0, "repeat_count": 0.0, "routers_loss": 0.5754324197769165, "skip_count": 4.0, "step": 574, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 34.0, "epoch": 3.3495994173343044, "f1_execute": 0.970588207244873, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 1.0390625, "learning_rate": 0.0009997884750411004, "loss": 0.0829, "macro_f1": 0.4901960790157318, "num_tokens": 842797.0, "repeat_count": 0.0, "routers_loss": 0.11119719594717026, "skip_count": 2.0, "step": 576, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 3.361252731245448, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.63671875, "learning_rate": 0.000999777044141876, "loss": 0.0676, "macro_f1": 0.32380953431129456, "num_tokens": 846551.0, "repeat_count": 0.0, "routers_loss": 0.13672441244125366, "skip_count": 1.0, "step": 578, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.3729060451565913, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.90625, "learning_rate": 0.0009997653125201117, "loss": 0.2973, "macro_f1": 0.32380953431129456, "num_tokens": 848997.0, "repeat_count": 0.0, "routers_loss": 0.04769091680645943, "skip_count": 2.0, "step": 580, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.384559359067735, "f1_execute": 0.8923077583312988, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0703125, "learning_rate": 0.0009997532801828658, "loss": 0.0961, "macro_f1": 0.2974359393119812, "num_tokens": 851921.0, "repeat_count": 1.0, "routers_loss": 0.22279596328735352, "skip_count": 4.0, "step": 582, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.396212672978878, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3515625, "learning_rate": 0.0009997409471373788, "loss": 0.0803, "macro_f1": 0.3137255012989044, "num_tokens": 854488.0, "repeat_count": 0.0, "routers_loss": 0.10167410224676132, "skip_count": 2.0, "step": 584, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 36.0, "epoch": 3.407865986890022, "f1_execute": 0.9552239179611206, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.7578125, "learning_rate": 0.0009997283133910716, "loss": 0.0819, "macro_f1": 0.48507463932037354, "num_tokens": 857244.0, "repeat_count": 0.0, "routers_loss": 0.09702060371637344, "skip_count": 3.0, "step": 586, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.4195193008011655, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.94921875, "learning_rate": 0.0009997153789515461, "loss": 0.0687, "macro_f1": 0.3333333432674408, "num_tokens": 859663.0, "repeat_count": 0.0, "routers_loss": 0.00417871680110693, "skip_count": 0.0, "step": 588, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 3.4311726147123087, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 1.140625, "learning_rate": 0.0009997021438265853, "loss": 0.1165, "macro_f1": 0.4901960790157318, "num_tokens": 862273.0, "repeat_count": 0.0, "routers_loss": 0.08385443687438965, "skip_count": 3.0, "step": 590, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 33.0, "epoch": 3.4428259286234524, "f1_execute": 0.939393937587738, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 0.8984375, "learning_rate": 0.0009996886080241524, "loss": 0.0717, "macro_f1": 0.42424243688583374, "num_tokens": 866088.0, "repeat_count": 0.0, "routers_loss": 0.10002566874027252, "skip_count": 3.0, "step": 592, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 3.4544792425345956, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.0078125, "learning_rate": 0.0009996747715523924, "loss": 0.1095, "macro_f1": 0.5507246255874634, "num_tokens": 868940.0, "repeat_count": 0.0, "routers_loss": 0.08483508974313736, "skip_count": 2.0, "step": 594, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 3.4661325564457393, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.79296875, "learning_rate": 0.000999660634419631, "loss": 0.059, "macro_f1": 0.4901960790157318, "num_tokens": 871961.0, "repeat_count": 0.0, "routers_loss": 0.10679342597723007, "skip_count": 3.0, "step": 596, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.477785870356883, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.109375, "learning_rate": 0.0009996461966343747, "loss": 0.0922, "macro_f1": 0.32863849401474, "num_tokens": 875204.0, "repeat_count": 1.0, "routers_loss": 0.06789799779653549, "skip_count": 0.0, "step": 598, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 3.489439184268026, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.7890625, "learning_rate": 0.0009996314582053105, "loss": 0.1127, "macro_f1": 0.5507246255874634, "num_tokens": 877794.0, "repeat_count": 0.0, "routers_loss": 0.057570651173591614, "skip_count": 1.0, "step": 600, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 3.50109249817917, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.1953125, "learning_rate": 0.0009996164191413072, "loss": 0.1007, "macro_f1": 0.5507246255874634, "num_tokens": 880293.0, "repeat_count": 0.0, "routers_loss": 0.06415726244449615, "skip_count": 2.0, "step": 602, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 33.0, "epoch": 3.512745812090313, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0546875, "learning_rate": 0.000999601079451414, "loss": 0.073, "macro_f1": 0.3137255311012268, "num_tokens": 883234.0, "repeat_count": 1.0, "routers_loss": 0.06647336483001709, "skip_count": 0.0, "step": 604, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.5243991260014567, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.76953125, "learning_rate": 0.0009995854391448608, "loss": 0.0665, "macro_f1": 0.32863849401474, "num_tokens": 885843.0, "repeat_count": 0.0, "routers_loss": 0.025467203930020332, "skip_count": 0.0, "step": 606, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.5360524399126003, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.61328125, "learning_rate": 0.0009995694982310583, "loss": 0.0831, "macro_f1": 0.3333333432674408, "num_tokens": 888689.0, "repeat_count": 0.0, "routers_loss": 0.002837743144482374, "skip_count": 0.0, "step": 608, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.5477057538237435, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5703125, "learning_rate": 0.000999553256719599, "loss": 0.1139, "macro_f1": 0.3137255311012268, "num_tokens": 892562.0, "repeat_count": 1.0, "routers_loss": 0.12482861429452896, "skip_count": 3.0, "step": 610, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 34.0, "epoch": 3.559359067734887, "f1_execute": 0.8852458596229553, "f1_repeat": 0.0, "f1_skip": 0.444444477558136, "grad_norm": 0.90234375, "learning_rate": 0.000999536714620255, "loss": 0.0852, "macro_f1": 0.44323012232780457, "num_tokens": 895017.0, "repeat_count": 1.0, "routers_loss": 0.3351808190345764, "skip_count": 6.0, "step": 612, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 3.571012381646031, "f1_execute": 0.9230769872665405, "f1_repeat": 0.5, "f1_skip": 0.0, "grad_norm": 0.87890625, "learning_rate": 0.0009995198719429804, "loss": 0.08, "macro_f1": 0.47435900568962097, "num_tokens": 897753.0, "repeat_count": 2.0, "routers_loss": 0.2776757478713989, "skip_count": 3.0, "step": 614, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.582665695557174, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2578125, "learning_rate": 0.0009995027286979093, "loss": 0.1062, "macro_f1": 0.32380953431129456, "num_tokens": 901033.0, "repeat_count": 0.0, "routers_loss": 0.1498979777097702, "skip_count": 1.0, "step": 616, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 34.0, "epoch": 3.5943190094683173, "f1_execute": 0.970588207244873, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.98828125, "learning_rate": 0.0009994852848953573, "loss": 0.0726, "macro_f1": 0.4901960790157318, "num_tokens": 904216.0, "repeat_count": 0.0, "routers_loss": 0.13804541528224945, "skip_count": 2.0, "step": 618, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.605972323379461, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.8046875, "learning_rate": 0.0009994675405458205, "loss": 0.0668, "macro_f1": 0.3137255012989044, "num_tokens": 906818.0, "repeat_count": 1.0, "routers_loss": 0.08404725044965744, "skip_count": 1.0, "step": 620, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.6176256372906046, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3359375, "learning_rate": 0.0009994494956599759, "loss": 0.0696, "macro_f1": 0.3188405930995941, "num_tokens": 909629.0, "repeat_count": 1.0, "routers_loss": 0.20976489782333374, "skip_count": 1.0, "step": 622, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.629278951201748, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.9375, "learning_rate": 0.0009994311502486812, "loss": 0.1036, "macro_f1": 0.3333333432674408, "num_tokens": 912553.0, "repeat_count": 0.0, "routers_loss": 0.007379824295639992, "skip_count": 0.0, "step": 624, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.6409322651128915, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1640625, "learning_rate": 0.0009994125043229752, "loss": 0.099, "macro_f1": 0.32863849401474, "num_tokens": 915235.0, "repeat_count": 0.0, "routers_loss": 0.033134542405605316, "skip_count": 0.0, "step": 626, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.652585579024035, "f1_execute": 0.9552239179611206, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 1.234375, "learning_rate": 0.0009993935578940774, "loss": 0.0981, "macro_f1": 0.5406302213668823, "num_tokens": 918709.0, "repeat_count": 2.0, "routers_loss": 0.11185518652200699, "skip_count": 0.0, "step": 628, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.6642388929351783, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.64453125, "learning_rate": 0.0009993743109733882, "loss": 0.1043, "macro_f1": 0.32863849401474, "num_tokens": 921668.0, "repeat_count": 0.0, "routers_loss": 0.012559465132653713, "skip_count": 0.0, "step": 630, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.675892206846322, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.94140625, "learning_rate": 0.0009993547635724887, "loss": 0.0519, "macro_f1": 0.3188405930995941, "num_tokens": 924674.0, "repeat_count": 0.0, "routers_loss": 0.06662051379680634, "skip_count": 1.0, "step": 632, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.6875455207574657, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.57421875, "learning_rate": 0.0009993349157031406, "loss": 0.068, "macro_f1": 0.3333333432674408, "num_tokens": 927718.0, "repeat_count": 0.0, "routers_loss": 0.003487096866592765, "skip_count": 0.0, "step": 634, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 33.0, "epoch": 3.699198834668609, "f1_execute": 0.939393937587738, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 0.734375, "learning_rate": 0.0009993147673772868, "loss": 0.1495, "macro_f1": 0.42424243688583374, "num_tokens": 930151.0, "repeat_count": 0.0, "routers_loss": 0.0866234079003334, "skip_count": 3.0, "step": 636, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.710852148579752, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0, "learning_rate": 0.000999294318607051, "loss": 0.0863, "macro_f1": 0.32380953431129456, "num_tokens": 933189.0, "repeat_count": 1.0, "routers_loss": 0.03790837526321411, "skip_count": 1.0, "step": 638, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.7225054624908958, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3359375, "learning_rate": 0.0009992735694047373, "loss": 0.0796, "macro_f1": 0.3188405930995941, "num_tokens": 935934.0, "repeat_count": 0.0, "routers_loss": 0.08045468479394913, "skip_count": 2.0, "step": 640, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.7341587764020394, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.72265625, "learning_rate": 0.0009992525197828309, "loss": 0.0652, "macro_f1": 0.3333333432674408, "num_tokens": 939196.0, "repeat_count": 0.0, "routers_loss": 0.011753711849451065, "skip_count": 0.0, "step": 642, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 3.7458120903131826, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.68359375, "learning_rate": 0.0009992311697539973, "loss": 0.0866, "macro_f1": 0.4901960790157318, "num_tokens": 941982.0, "repeat_count": 0.0, "routers_loss": 0.1489081233739853, "skip_count": 3.0, "step": 644, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.7574654042243263, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5625, "learning_rate": 0.0009992095193310836, "loss": 0.1205, "macro_f1": 0.3333333432674408, "num_tokens": 944639.0, "repeat_count": 0.0, "routers_loss": 0.0054289051331579685, "skip_count": 0.0, "step": 646, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 34.0, "epoch": 3.76911871813547, "f1_execute": 0.970588207244873, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.77734375, "learning_rate": 0.0009991875685271168, "loss": 0.0866, "macro_f1": 0.4901960790157318, "num_tokens": 948407.0, "repeat_count": 0.0, "routers_loss": 0.03848421946167946, "skip_count": 2.0, "step": 648, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.780772032046613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.8984375, "learning_rate": 0.000999165317355305, "loss": 0.0642, "macro_f1": 0.3333333432674408, "num_tokens": 950908.0, "repeat_count": 0.0, "routers_loss": 0.00852389819920063, "skip_count": 0.0, "step": 650, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 3.792425345957757, "f1_execute": 0.9375, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.86328125, "learning_rate": 0.000999142765829037, "loss": 0.1001, "macro_f1": 0.5347222685813904, "num_tokens": 953697.0, "repeat_count": 1.0, "routers_loss": 0.08259768038988113, "skip_count": 4.0, "step": 652, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.8040786598689005, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.82421875, "learning_rate": 0.0009991199139618827, "loss": 0.0824, "macro_f1": 0.32380953431129456, "num_tokens": 956638.0, "repeat_count": 0.0, "routers_loss": 0.12160049378871918, "skip_count": 2.0, "step": 654, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 3.8157319737800437, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.94140625, "learning_rate": 0.000999096761767592, "loss": 0.1118, "macro_f1": 0.4743589758872986, "num_tokens": 958993.0, "repeat_count": 3.0, "routers_loss": 0.5869073867797852, "skip_count": 3.0, "step": 656, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.827385287691187, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0546875, "learning_rate": 0.0009990733092600961, "loss": 0.055, "macro_f1": 0.32380953431129456, "num_tokens": 961533.0, "repeat_count": 0.0, "routers_loss": 0.06475231796503067, "skip_count": 2.0, "step": 658, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.8390386016023306, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0703125, "learning_rate": 0.0009990495564535064, "loss": 0.0814, "macro_f1": 0.32380953431129456, "num_tokens": 964967.0, "repeat_count": 1.0, "routers_loss": 0.6450141072273254, "skip_count": 1.0, "step": 660, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.8506919155134742, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.578125, "learning_rate": 0.0009990255033621158, "loss": 0.09, "macro_f1": 0.32863849401474, "num_tokens": 967952.0, "repeat_count": 1.0, "routers_loss": 0.3160175383090973, "skip_count": 0.0, "step": 662, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.8623452294246174, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.6640625, "learning_rate": 0.000999001150000397, "loss": 0.0908, "macro_f1": 0.32863849401474, "num_tokens": 970847.0, "repeat_count": 0.0, "routers_loss": 0.015495280735194683, "skip_count": 0.0, "step": 664, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.873998543335761, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.875, "learning_rate": 0.0009989764963830037, "loss": 0.07, "macro_f1": 0.32863849401474, "num_tokens": 973517.0, "repeat_count": 0.0, "routers_loss": 0.0958336740732193, "skip_count": 1.0, "step": 666, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.8856518572469048, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.875, "learning_rate": 0.0009989515425247704, "loss": 0.096, "macro_f1": 0.32380953431129456, "num_tokens": 977322.0, "repeat_count": 1.0, "routers_loss": 0.14332996308803558, "skip_count": 1.0, "step": 668, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.897305171158048, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.75390625, "learning_rate": 0.0009989262884407125, "loss": 0.0977, "macro_f1": 0.32380953431129456, "num_tokens": 980170.0, "repeat_count": 0.0, "routers_loss": 0.046958889812231064, "skip_count": 1.0, "step": 670, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 3.9089584850691916, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.046875, "learning_rate": 0.000998900734146025, "loss": 0.0783, "macro_f1": 0.545751690864563, "num_tokens": 983056.0, "repeat_count": 0.0, "routers_loss": 0.041316110640764236, "skip_count": 1.0, "step": 672, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 33.0, "epoch": 3.9206117989803353, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.2857142686843872, "grad_norm": 1.03125, "learning_rate": 0.000998874879656085, "loss": 0.0726, "macro_f1": 0.40293043851852417, "num_tokens": 985663.0, "repeat_count": 0.0, "routers_loss": 0.14400599896907806, "skip_count": 4.0, "step": 674, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.9322651128914785, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.140625, "learning_rate": 0.0009988487249864488, "loss": 0.0776, "macro_f1": 0.3137255311012268, "num_tokens": 988303.0, "repeat_count": 1.0, "routers_loss": 0.20256958901882172, "skip_count": 3.0, "step": 676, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 34.0, "epoch": 3.9439184268026217, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.84375, "learning_rate": 0.0009988222701528546, "loss": 0.0653, "macro_f1": 0.32380953431129456, "num_tokens": 991070.0, "repeat_count": 0.0, "routers_loss": 0.055199433118104935, "skip_count": 0.0, "step": 678, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.9555717407137654, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.9765625, "learning_rate": 0.0009987955151712204, "loss": 0.1242, "macro_f1": 0.3333333432674408, "num_tokens": 994177.0, "repeat_count": 0.0, "routers_loss": 0.011528365314006805, "skip_count": 0.0, "step": 680, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.967225054624909, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.87890625, "learning_rate": 0.000998768460057645, "loss": 0.0575, "macro_f1": 0.3333333432674408, "num_tokens": 996612.0, "repeat_count": 0.0, "routers_loss": 0.009386923164129257, "skip_count": 0.0, "step": 682, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 3.9788783685360523, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.15625, "learning_rate": 0.000998741104828408, "loss": 0.0969, "macro_f1": 0.3188405930995941, "num_tokens": 999032.0, "repeat_count": 2.0, "routers_loss": 0.24700556695461273, "skip_count": 1.0, "step": 684, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 3.990531682447196, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.09375, "learning_rate": 0.000998713449499969, "loss": 0.0812, "macro_f1": 0.32380953431129456, "num_tokens": 1002366.0, "repeat_count": 1.0, "routers_loss": 0.47267431020736694, "skip_count": 0.0, "step": 686, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 4.0, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.58984375, "learning_rate": 0.0009986854940889692, "loss": 0.0586, "macro_f1": 0.32863849401474, "num_tokens": 1004864.0, "repeat_count": 0.0, "routers_loss": 0.013957676477730274, "skip_count": 0.0, "step": 688, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.011653313911143, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.734375, "learning_rate": 0.000998657238612229, "loss": 0.0813, "macro_f1": 0.3333333432674408, "num_tokens": 1006765.0, "repeat_count": 0.0, "routers_loss": 0.007732318714261055, "skip_count": 0.0, "step": 690, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 4.023306627822287, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.6640625, "learning_rate": 0.000998628683086751, "loss": 0.0893, "macro_f1": 0.32380953431129456, "num_tokens": 1009590.0, "repeat_count": 0.0, "routers_loss": 0.04803141579031944, "skip_count": 1.0, "step": 692, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.0349599417334305, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.046875, "learning_rate": 0.0009985998275297164, "loss": 0.0721, "macro_f1": 0.3333333432674408, "num_tokens": 1012215.0, "repeat_count": 0.0, "routers_loss": 0.005472803488373756, "skip_count": 0.0, "step": 694, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 4.046613255644574, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.953125, "learning_rate": 0.0009985706719584887, "loss": 0.0757, "macro_f1": 0.32863849401474, "num_tokens": 1014406.0, "repeat_count": 0.0, "routers_loss": 0.014061192981898785, "skip_count": 0.0, "step": 696, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 4.058266569555718, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3359375, "learning_rate": 0.000998541216390611, "loss": 0.0598, "macro_f1": 0.32863849401474, "num_tokens": 1016936.0, "repeat_count": 0.0, "routers_loss": 0.020411675795912743, "skip_count": 0.0, "step": 698, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 4.069919883466861, "f1_execute": 0.920634925365448, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.0703125, "learning_rate": 0.0009985114608438072, "loss": 0.0605, "macro_f1": 0.5291005373001099, "num_tokens": 1020064.0, "repeat_count": 2.0, "routers_loss": 0.16979196667671204, "skip_count": 4.0, "step": 700, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.081573197378004, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0234375, "learning_rate": 0.0009984814053359814, "loss": 0.0486, "macro_f1": 0.3333333432674408, "num_tokens": 1022922.0, "repeat_count": 0.0, "routers_loss": 0.003148297080770135, "skip_count": 0.0, "step": 702, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 4.0932265112891475, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.58203125, "learning_rate": 0.0009984510498852188, "loss": 0.0406, "macro_f1": 0.6666666865348816, "num_tokens": 1026412.0, "repeat_count": 0.0, "routers_loss": 0.009915413334965706, "skip_count": 2.0, "step": 704, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 4.104879825200292, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.828125, "learning_rate": 0.0009984203945097843, "loss": 0.0479, "macro_f1": 0.661835789680481, "num_tokens": 1029386.0, "repeat_count": 1.0, "routers_loss": 0.015055741183459759, "skip_count": 1.0, "step": 706, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 34.0, "epoch": 4.116533139111435, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.8515625, "learning_rate": 0.0009983894392281236, "loss": 0.0585, "macro_f1": 0.3137255012989044, "num_tokens": 1031714.0, "repeat_count": 0.0, "routers_loss": 0.16579927504062653, "skip_count": 2.0, "step": 708, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 4.128186453022578, "f1_execute": 0.9523809552192688, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.62890625, "learning_rate": 0.0009983581840588632, "loss": 0.0734, "macro_f1": 0.4841269850730896, "num_tokens": 1035285.0, "repeat_count": 0.0, "routers_loss": 0.08674705773591995, "skip_count": 6.0, "step": 710, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 4.139839766933722, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.765625, "learning_rate": 0.0009983266290208095, "loss": 0.0504, "macro_f1": 0.32380953431129456, "num_tokens": 1038166.0, "repeat_count": 0.0, "routers_loss": 0.06395600736141205, "skip_count": 1.0, "step": 712, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.151493080844865, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.69921875, "learning_rate": 0.0009982947741329498, "loss": 0.0776, "macro_f1": 0.32863849401474, "num_tokens": 1040777.0, "repeat_count": 1.0, "routers_loss": 0.015867259353399277, "skip_count": 0.0, "step": 714, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 4.163146394756009, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.466796875, "learning_rate": 0.0009982626194144516, "loss": 0.0509, "macro_f1": 0.661835789680481, "num_tokens": 1044244.0, "repeat_count": 1.0, "routers_loss": 0.046437155455350876, "skip_count": 1.0, "step": 716, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 4.174799708667152, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.546875, "learning_rate": 0.0009982301648846627, "loss": 0.0373, "macro_f1": 0.5950249433517456, "num_tokens": 1046501.0, "repeat_count": 0.0, "routers_loss": 0.018194062635302544, "skip_count": 3.0, "step": 718, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.186453022578296, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.87890625, "learning_rate": 0.0009981974105631114, "loss": 0.0765, "macro_f1": 0.3333333432674408, "num_tokens": 1049061.0, "repeat_count": 0.0, "routers_loss": 0.007901065051555634, "skip_count": 0.0, "step": 720, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.198106336489439, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.61328125, "learning_rate": 0.0009981643564695066, "loss": 0.0565, "macro_f1": 0.3333333432674408, "num_tokens": 1052113.0, "repeat_count": 0.0, "routers_loss": 0.008173886686563492, "skip_count": 0.0, "step": 722, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 4.209759650400582, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.81640625, "learning_rate": 0.0009981310026237371, "loss": 0.055, "macro_f1": 0.32863849401474, "num_tokens": 1054849.0, "repeat_count": 0.0, "routers_loss": 0.018083596602082253, "skip_count": 0.0, "step": 724, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.221412964311726, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.49609375, "learning_rate": 0.0009980973490458728, "loss": 0.0395, "macro_f1": 0.3333333432674408, "num_tokens": 1058111.0, "repeat_count": 0.0, "routers_loss": 0.015131649561226368, "skip_count": 0.0, "step": 726, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 4.23306627822287, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.423828125, "learning_rate": 0.000998063395756163, "loss": 0.0666, "macro_f1": 0.545751690864563, "num_tokens": 1062242.0, "repeat_count": 1.0, "routers_loss": 0.3139135539531708, "skip_count": 2.0, "step": 728, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.244719592134013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.63671875, "learning_rate": 0.0009980291427750382, "loss": 0.0631, "macro_f1": 0.3333333432674408, "num_tokens": 1065098.0, "repeat_count": 0.0, "routers_loss": 0.009496678598225117, "skip_count": 0.0, "step": 730, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.256372906045157, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.640625, "learning_rate": 0.000997994590123109, "loss": 0.0373, "macro_f1": 0.32380953431129456, "num_tokens": 1067899.0, "repeat_count": 0.0, "routers_loss": 0.03721831366419792, "skip_count": 2.0, "step": 732, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 4.2680262199563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.6875, "learning_rate": 0.0009979597378211656, "loss": 0.054, "macro_f1": 0.6666666865348816, "num_tokens": 1070946.0, "repeat_count": 0.0, "routers_loss": 0.006780209485441446, "skip_count": 1.0, "step": 734, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.279679533867443, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.7109375, "learning_rate": 0.0009979245858901795, "loss": 0.054, "macro_f1": 0.3333333432674408, "num_tokens": 1073414.0, "repeat_count": 0.0, "routers_loss": 0.0023067891597747803, "skip_count": 0.0, "step": 736, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 4.2913328477785875, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.796875, "learning_rate": 0.0009978891343513023, "loss": 0.0677, "macro_f1": 0.32380953431129456, "num_tokens": 1076245.0, "repeat_count": 1.0, "routers_loss": 0.10942371934652328, "skip_count": 0.0, "step": 738, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.302986161689731, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.98046875, "learning_rate": 0.0009978533832258653, "loss": 0.0608, "macro_f1": 0.32863849401474, "num_tokens": 1078912.0, "repeat_count": 0.0, "routers_loss": 0.020231494680047035, "skip_count": 1.0, "step": 740, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 4.314639475600874, "f1_execute": 0.970588207244873, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.64453125, "learning_rate": 0.0009978173325353803, "loss": 0.0488, "macro_f1": 0.545751690864563, "num_tokens": 1081659.0, "repeat_count": 1.0, "routers_loss": 0.021388640627264977, "skip_count": 1.0, "step": 742, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 4.326292789512017, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.4375, "learning_rate": 0.00099778098230154, "loss": 0.0548, "macro_f1": 0.32863849401474, "num_tokens": 1084800.0, "repeat_count": 0.0, "routers_loss": 0.018617408350110054, "skip_count": 0.0, "step": 744, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.337946103423161, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.61328125, "learning_rate": 0.0009977443325462165, "loss": 0.0554, "macro_f1": 0.3333333432674408, "num_tokens": 1087758.0, "repeat_count": 0.0, "routers_loss": 0.012268011458218098, "skip_count": 0.0, "step": 746, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.349599417334304, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.546875, "learning_rate": 0.0009977073832914624, "loss": 0.0397, "macro_f1": 0.3333333432674408, "num_tokens": 1091219.0, "repeat_count": 0.0, "routers_loss": 0.0047331154346466064, "skip_count": 0.0, "step": 748, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.361252731245448, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.62890625, "learning_rate": 0.0009976701345595109, "loss": 0.0511, "macro_f1": 0.32863849401474, "num_tokens": 1094236.0, "repeat_count": 1.0, "routers_loss": 0.07411985844373703, "skip_count": 0.0, "step": 750, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 4.372906045156592, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.5625, "learning_rate": 0.0009976325863727746, "loss": 0.0585, "macro_f1": 0.4901960790157318, "num_tokens": 1097245.0, "repeat_count": 0.0, "routers_loss": 0.05602121353149414, "skip_count": 3.0, "step": 752, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 4.384559359067735, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.376953125, "learning_rate": 0.000997594738753847, "loss": 0.0384, "macro_f1": 0.6666666865348816, "num_tokens": 1100105.0, "repeat_count": 1.0, "routers_loss": 0.001379751367494464, "skip_count": 0.0, "step": 754, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 4.396212672978878, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.77734375, "learning_rate": 0.0009975565917255016, "loss": 0.0459, "macro_f1": 0.32380953431129456, "num_tokens": 1102878.0, "repeat_count": 1.0, "routers_loss": 0.09103039652109146, "skip_count": 0.0, "step": 756, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 4.407865986890021, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.45703125, "learning_rate": 0.0009975181453106918, "loss": 0.0386, "macro_f1": 0.5507246255874634, "num_tokens": 1106136.0, "repeat_count": 0.0, "routers_loss": 0.03748704865574837, "skip_count": 2.0, "step": 758, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.4195193008011655, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.84375, "learning_rate": 0.0009974793995325513, "loss": 0.066, "macro_f1": 0.32380953431129456, "num_tokens": 1108819.0, "repeat_count": 1.0, "routers_loss": 0.11070743948221207, "skip_count": 1.0, "step": 760, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 4.431172614712309, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.52734375, "learning_rate": 0.0009974403544143941, "loss": 0.0676, "macro_f1": 0.545751690864563, "num_tokens": 1111486.0, "repeat_count": 1.0, "routers_loss": 0.11156581342220306, "skip_count": 2.0, "step": 762, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.20000000298023224, "avg_layers": 37.0, "epoch": 4.442825928623452, "f1_execute": 0.9354838728904724, "f1_repeat": 1.0, "f1_skip": 0.3333333134651184, "grad_norm": 0.396484375, "learning_rate": 0.000997401009979714, "loss": 0.0471, "macro_f1": 0.7562724351882935, "num_tokens": 1114618.0, "repeat_count": 2.0, "routers_loss": 0.09396973252296448, "skip_count": 5.0, "step": 764, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 4.454479242534596, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.81640625, "learning_rate": 0.0009973613662521855, "loss": 0.062, "macro_f1": 0.545751690864563, "num_tokens": 1117217.0, "repeat_count": 1.0, "routers_loss": 0.0970328152179718, "skip_count": 2.0, "step": 766, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 4.466132556445739, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.68359375, "learning_rate": 0.0009973214232556623, "loss": 0.0519, "macro_f1": 0.5507246255874634, "num_tokens": 1120667.0, "repeat_count": 0.0, "routers_loss": 0.03365720063447952, "skip_count": 2.0, "step": 768, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.4777858703568825, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.80859375, "learning_rate": 0.0009972811810141787, "loss": 0.0582, "macro_f1": 0.3333333432674408, "num_tokens": 1123110.0, "repeat_count": 0.0, "routers_loss": 0.005229696165770292, "skip_count": 0.0, "step": 770, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 4.489439184268027, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.109375, "learning_rate": 0.000997240639551949, "loss": 0.0664, "macro_f1": 0.32863849401474, "num_tokens": 1126296.0, "repeat_count": 0.0, "routers_loss": 0.021092286333441734, "skip_count": 0.0, "step": 772, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 37.0, "epoch": 4.50109249817917, "f1_execute": 0.9846153855323792, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.451171875, "learning_rate": 0.000997199798893368, "loss": 0.0412, "macro_f1": 0.8837606906890869, "num_tokens": 1129409.0, "repeat_count": 2.0, "routers_loss": 0.05397951230406761, "skip_count": 2.0, "step": 774, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 4.512745812090313, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.65625, "learning_rate": 0.0009971586590630092, "loss": 0.0541, "macro_f1": 0.5507246255874634, "num_tokens": 1131982.0, "repeat_count": 0.0, "routers_loss": 0.0660758763551712, "skip_count": 2.0, "step": 776, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.524399126001457, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3671875, "learning_rate": 0.0009971172200856277, "loss": 0.0332, "macro_f1": 0.3333333432674408, "num_tokens": 1135213.0, "repeat_count": 0.0, "routers_loss": 0.002685349667444825, "skip_count": 0.0, "step": 778, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.5360524399126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.55078125, "learning_rate": 0.0009970754819861576, "loss": 0.0688, "macro_f1": 0.3333333432674408, "num_tokens": 1138709.0, "repeat_count": 0.0, "routers_loss": 0.0056343418546020985, "skip_count": 0.0, "step": 780, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 33.0, "epoch": 4.5477057538237435, "f1_execute": 0.9696969985961914, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.0859375, "learning_rate": 0.0009970334447897136, "loss": 0.0672, "macro_f1": 0.5454546213150024, "num_tokens": 1141581.0, "repeat_count": 0.0, "routers_loss": 0.05613188073039055, "skip_count": 3.0, "step": 782, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.559359067734887, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.62109375, "learning_rate": 0.0009969911085215895, "loss": 0.0487, "macro_f1": 0.3333333432674408, "num_tokens": 1144624.0, "repeat_count": 0.0, "routers_loss": 0.004509816411882639, "skip_count": 0.0, "step": 784, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.4000000059604645, "avg_layers": 34.0, "epoch": 4.571012381646031, "f1_execute": 0.9375, "f1_repeat": 0.0, "f1_skip": 0.5714285969734192, "grad_norm": 1.265625, "learning_rate": 0.00099694847320726, "loss": 0.0627, "macro_f1": 0.5029761791229248, "num_tokens": 1147227.0, "repeat_count": 1.0, "routers_loss": 0.07778463512659073, "skip_count": 5.0, "step": 786, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.582665695557174, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.74609375, "learning_rate": 0.0009969055388723792, "loss": 0.0507, "macro_f1": 0.3333333432674408, "num_tokens": 1150123.0, "repeat_count": 0.0, "routers_loss": 0.0020941768307238817, "skip_count": 0.0, "step": 788, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 4.594319009468317, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.7109375, "learning_rate": 0.0009968623055427812, "loss": 0.0573, "macro_f1": 0.32863849401474, "num_tokens": 1152780.0, "repeat_count": 0.0, "routers_loss": 0.012662393972277641, "skip_count": 0.0, "step": 790, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 4.605972323379461, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.62109375, "learning_rate": 0.0009968187732444804, "loss": 0.0442, "macro_f1": 0.5507246255874634, "num_tokens": 1156237.0, "repeat_count": 0.0, "routers_loss": 0.04398556426167488, "skip_count": 2.0, "step": 792, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.617625637290605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.9609375, "learning_rate": 0.0009967749420036703, "loss": 0.0602, "macro_f1": 0.3333333432674408, "num_tokens": 1158978.0, "repeat_count": 0.0, "routers_loss": 0.017803072929382324, "skip_count": 0.0, "step": 794, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.629278951201748, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.7890625, "learning_rate": 0.0009967308118467252, "loss": 0.0643, "macro_f1": 0.3333333432674408, "num_tokens": 1161624.0, "repeat_count": 0.0, "routers_loss": 0.005400592926889658, "skip_count": 0.0, "step": 796, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.640932265112891, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.55859375, "learning_rate": 0.000996686382800198, "loss": 0.0521, "macro_f1": 0.32863849401474, "num_tokens": 1164649.0, "repeat_count": 1.0, "routers_loss": 0.07390843331813812, "skip_count": 0.0, "step": 798, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.652585579024035, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.53125, "learning_rate": 0.0009966416548908236, "loss": 0.0384, "macro_f1": 0.3333333432674408, "num_tokens": 1167695.0, "repeat_count": 0.0, "routers_loss": 0.0107541149482131, "skip_count": 0.0, "step": 800, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.664238892935178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.875, "learning_rate": 0.000996596628145514, "loss": 0.0536, "macro_f1": 0.3333333432674408, "num_tokens": 1170786.0, "repeat_count": 0.0, "routers_loss": 0.010526482947170734, "skip_count": 0.0, "step": 802, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.675892206846322, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.828125, "learning_rate": 0.0009965513025913635, "loss": 0.0556, "macro_f1": 0.3333333432674408, "num_tokens": 1174497.0, "repeat_count": 0.0, "routers_loss": 0.0044478015042841434, "skip_count": 0.0, "step": 804, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 34.0, "epoch": 4.687545520757466, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.109375, "learning_rate": 0.0009965056782556443, "loss": 0.0653, "macro_f1": 0.3137255012989044, "num_tokens": 1177189.0, "repeat_count": 0.0, "routers_loss": 0.07509694248437881, "skip_count": 2.0, "step": 806, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 35.0, "epoch": 4.699198834668609, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.77734375, "learning_rate": 0.0009964597551658095, "loss": 0.0528, "macro_f1": 0.44102567434310913, "num_tokens": 1179895.0, "repeat_count": 2.0, "routers_loss": 0.10644777864217758, "skip_count": 4.0, "step": 808, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.710852148579752, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.421875, "learning_rate": 0.0009964135333494918, "loss": 0.0501, "macro_f1": 0.32380953431129456, "num_tokens": 1182637.0, "repeat_count": 0.0, "routers_loss": 0.09883294254541397, "skip_count": 2.0, "step": 810, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 4.722505462490896, "f1_execute": 0.9552239179611206, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.66796875, "learning_rate": 0.000996367012834503, "loss": 0.0498, "macro_f1": 0.48507463932037354, "num_tokens": 1185542.0, "repeat_count": 1.0, "routers_loss": 0.06558815389871597, "skip_count": 3.0, "step": 812, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 4.734158776402039, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.392578125, "learning_rate": 0.0009963201936488354, "loss": 0.0423, "macro_f1": 0.32863849401474, "num_tokens": 1188448.0, "repeat_count": 0.0, "routers_loss": 0.018773503601551056, "skip_count": 0.0, "step": 814, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 4.745812090313183, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.77734375, "learning_rate": 0.0009962730758206612, "loss": 0.0798, "macro_f1": 0.32863849401474, "num_tokens": 1190795.0, "repeat_count": 0.0, "routers_loss": 0.020338093861937523, "skip_count": 0.0, "step": 816, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 4.757465404224327, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.66015625, "learning_rate": 0.000996225659378331, "loss": 0.0559, "macro_f1": 0.5507246255874634, "num_tokens": 1193911.0, "repeat_count": 0.0, "routers_loss": 0.01845880039036274, "skip_count": 2.0, "step": 818, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 4.76911871813547, "f1_execute": 0.9705882668495178, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.62109375, "learning_rate": 0.0009961779443503763, "loss": 0.0566, "macro_f1": 0.656862735748291, "num_tokens": 1197455.0, "repeat_count": 1.0, "routers_loss": 0.044331956654787064, "skip_count": 2.0, "step": 820, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 4.780772032046613, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.8125, "learning_rate": 0.0009961299307655077, "loss": 0.0504, "macro_f1": 0.3188405930995941, "num_tokens": 1200026.0, "repeat_count": 0.0, "routers_loss": 0.05816047266125679, "skip_count": 2.0, "step": 822, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 33.0, "epoch": 4.792425345957756, "f1_execute": 0.9090909361839294, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.78515625, "learning_rate": 0.0009960816186526161, "loss": 0.0663, "macro_f1": 0.3030303120613098, "num_tokens": 1202855.0, "repeat_count": 0.0, "routers_loss": 0.12788966298103333, "skip_count": 3.0, "step": 824, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 4.8040786598689005, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.03125, "learning_rate": 0.000996033008040771, "loss": 0.0667, "macro_f1": 0.32863849401474, "num_tokens": 1205414.0, "repeat_count": 0.0, "routers_loss": 0.014318455010652542, "skip_count": 0.0, "step": 826, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.815731973780044, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.490234375, "learning_rate": 0.0009959840989592226, "loss": 0.0496, "macro_f1": 0.3333333432674408, "num_tokens": 1208477.0, "repeat_count": 0.0, "routers_loss": 0.007687705103307962, "skip_count": 0.0, "step": 828, "text_loss": 0.0 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 4.827385287691187, "f1_execute": 0.9375, "f1_repeat": 0.5, "f1_skip": 0.5, "grad_norm": 0.6875, "learning_rate": 0.0009959348914373996, "loss": 0.1447, "macro_f1": 0.6458333730697632, "num_tokens": 1212126.0, "repeat_count": 3.0, "routers_loss": 0.22013415396213531, "skip_count": 1.0, "step": 830, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.839038601602331, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5078125, "learning_rate": 0.000995885385504911, "loss": 0.046, "macro_f1": 0.3333333432674408, "num_tokens": 1216147.0, "repeat_count": 0.0, "routers_loss": 0.011709926649928093, "skip_count": 0.0, "step": 832, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 4.850691915513474, "f1_execute": 0.9552239179611206, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.6015625, "learning_rate": 0.0009958355811915452, "loss": 0.0647, "macro_f1": 0.5406302213668823, "num_tokens": 1218999.0, "repeat_count": 2.0, "routers_loss": 0.08733268082141876, "skip_count": 2.0, "step": 834, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.862345229424617, "f1_execute": 0.9552239179611206, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.61328125, "learning_rate": 0.0009957854785272702, "loss": 0.0583, "macro_f1": 0.6517413258552551, "num_tokens": 1222695.0, "repeat_count": 1.0, "routers_loss": 0.04787394031882286, "skip_count": 2.0, "step": 836, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.873998543335761, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.76171875, "learning_rate": 0.0009957350775422335, "loss": 0.0586, "macro_f1": 0.3333333432674408, "num_tokens": 1225145.0, "repeat_count": 0.0, "routers_loss": 0.007553502917289734, "skip_count": 0.0, "step": 838, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 4.885651857246905, "f1_execute": 0.9846153855323792, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.8359375, "learning_rate": 0.0009956843782667618, "loss": 0.047, "macro_f1": 0.928205132484436, "num_tokens": 1227771.0, "repeat_count": 1.0, "routers_loss": 0.030244583263993263, "skip_count": 2.0, "step": 840, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 4.897305171158048, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.80859375, "learning_rate": 0.0009956333807313617, "loss": 0.0605, "macro_f1": 0.3188405930995941, "num_tokens": 1230629.0, "repeat_count": 0.0, "routers_loss": 0.09476426988840103, "skip_count": 2.0, "step": 842, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 33.0, "epoch": 4.908958485069191, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.734375, "learning_rate": 0.0009955820849667193, "loss": 0.0605, "macro_f1": 0.3188405930995941, "num_tokens": 1233222.0, "repeat_count": 0.0, "routers_loss": 0.022804763168096542, "skip_count": 0.0, "step": 844, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.6484375, "learning_rate": 0.0009955304910036994, "loss": 0.0572, "macro_f1": 0.3333333432674408, "num_tokens": 1236075.0, "repeat_count": 0.0, "routers_loss": 0.011213175021111965, "skip_count": 0.0, "step": 846, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 34.0, "epoch": 4.9322651128914785, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.54296875, "learning_rate": 0.0009954785988733473, "loss": 0.0493, "macro_f1": 0.32380953431129456, "num_tokens": 1239173.0, "repeat_count": 0.0, "routers_loss": 0.03655659034848213, "skip_count": 0.0, "step": 848, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.943918426802622, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.54296875, "learning_rate": 0.0009954264086068868, "loss": 0.0482, "macro_f1": 0.3333333432674408, "num_tokens": 1241861.0, "repeat_count": 0.0, "routers_loss": 0.003702334361150861, "skip_count": 0.0, "step": 850, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.955571740713766, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.61328125, "learning_rate": 0.0009953739202357217, "loss": 0.0811, "macro_f1": 0.32380953431129456, "num_tokens": 1244837.0, "repeat_count": 1.0, "routers_loss": 0.09158194065093994, "skip_count": 1.0, "step": 852, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.967225054624909, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.55078125, "learning_rate": 0.0009953211337914351, "loss": 0.0496, "macro_f1": 0.3333333432674408, "num_tokens": 1248246.0, "repeat_count": 0.0, "routers_loss": 0.006052245851606131, "skip_count": 0.0, "step": 854, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.978878368536052, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.015625, "learning_rate": 0.0009952680493057892, "loss": 0.057, "macro_f1": 0.32863849401474, "num_tokens": 1250957.0, "repeat_count": 0.0, "routers_loss": 0.08091381192207336, "skip_count": 1.0, "step": 856, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 4.990531682447196, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0, "learning_rate": 0.0009952146668107255, "loss": 0.0588, "macro_f1": 0.3333333432674408, "num_tokens": 1253503.0, "repeat_count": 0.0, "routers_loss": 0.006424468010663986, "skip_count": 0.0, "step": 858, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 5.0, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.60546875, "learning_rate": 0.0009951609863383649, "loss": 0.03, "macro_f1": 0.6666666865348816, "num_tokens": 1256080.0, "repeat_count": 1.0, "routers_loss": 0.005914467852562666, "skip_count": 0.0, "step": 860, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 5.011653313911143, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.51171875, "learning_rate": 0.0009951070079210079, "loss": 0.0529, "macro_f1": 0.32863849401474, "num_tokens": 1259064.0, "repeat_count": 0.0, "routers_loss": 0.014915433712303638, "skip_count": 0.0, "step": 862, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.3333333432674408, "avg_layers": 37.0, "epoch": 5.023306627822287, "f1_execute": 0.9375, "f1_repeat": 0.5, "f1_skip": 0.5, "grad_norm": 0.7734375, "learning_rate": 0.000995052731591134, "loss": 0.0472, "macro_f1": 0.6458333730697632, "num_tokens": 1261772.0, "repeat_count": 2.0, "routers_loss": 0.07534078508615494, "skip_count": 3.0, "step": 864, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 5.0349599417334305, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.7265625, "learning_rate": 0.000994998157381402, "loss": 0.0485, "macro_f1": 0.32863849401474, "num_tokens": 1264445.0, "repeat_count": 0.0, "routers_loss": 0.03789233788847923, "skip_count": 0.0, "step": 866, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 5.046613255644574, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.4296875, "learning_rate": 0.0009949432853246496, "loss": 0.0328, "macro_f1": 0.5507246255874634, "num_tokens": 1267026.0, "repeat_count": 0.0, "routers_loss": 0.028946662321686745, "skip_count": 2.0, "step": 868, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.058266569555718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.62109375, "learning_rate": 0.0009948881154538945, "loss": 0.0576, "macro_f1": 0.3333333432674408, "num_tokens": 1270131.0, "repeat_count": 0.0, "routers_loss": 0.006060255691409111, "skip_count": 0.0, "step": 870, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.069919883466861, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.734375, "learning_rate": 0.000994832647802333, "loss": 0.0421, "macro_f1": 0.3333333432674408, "num_tokens": 1273294.0, "repeat_count": 0.0, "routers_loss": 0.005227828864008188, "skip_count": 0.0, "step": 872, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 5.081573197378004, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.671875, "learning_rate": 0.0009947768824033407, "loss": 0.0412, "macro_f1": 0.32863849401474, "num_tokens": 1276446.0, "repeat_count": 0.0, "routers_loss": 0.012527895160019398, "skip_count": 0.0, "step": 874, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 37.0, "epoch": 5.0932265112891475, "f1_execute": 0.9552239179611206, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.72265625, "learning_rate": 0.0009947208192904722, "loss": 0.041, "macro_f1": 0.5406302213668823, "num_tokens": 1279190.0, "repeat_count": 0.0, "routers_loss": 0.05703629553318024, "skip_count": 2.0, "step": 876, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 5.104879825200292, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.53125, "learning_rate": 0.0009946644584974617, "loss": 0.0353, "macro_f1": 0.6666666865348816, "num_tokens": 1281972.0, "repeat_count": 0.0, "routers_loss": 0.005276104435324669, "skip_count": 2.0, "step": 878, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.116533139111435, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.515625, "learning_rate": 0.0009946078000582224, "loss": 0.0305, "macro_f1": 0.32863849401474, "num_tokens": 1284601.0, "repeat_count": 0.0, "routers_loss": 0.08842003345489502, "skip_count": 1.0, "step": 880, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 34.0, "epoch": 5.128186453022578, "f1_execute": 0.9552239179611206, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.80859375, "learning_rate": 0.000994550844006846, "loss": 0.0397, "macro_f1": 0.4517413079738617, "num_tokens": 1286997.0, "repeat_count": 0.0, "routers_loss": 0.03315761312842369, "skip_count": 3.0, "step": 882, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 5.139839766933722, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.46484375, "learning_rate": 0.000994493590377604, "loss": 0.0412, "macro_f1": 0.5507246255874634, "num_tokens": 1289665.0, "repeat_count": 0.0, "routers_loss": 0.008611015044152737, "skip_count": 1.0, "step": 884, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.151493080844865, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.384765625, "learning_rate": 0.0009944360392049466, "loss": 0.0286, "macro_f1": 0.3333333432674408, "num_tokens": 1293142.0, "repeat_count": 0.0, "routers_loss": 0.00379526917822659, "skip_count": 0.0, "step": 886, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.163146394756009, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.39453125, "learning_rate": 0.000994378190523503, "loss": 0.0436, "macro_f1": 0.32863849401474, "num_tokens": 1295908.0, "repeat_count": 0.0, "routers_loss": 0.012087931856513023, "skip_count": 1.0, "step": 888, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 34.0, "epoch": 5.174799708667152, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.625, "learning_rate": 0.0009943200443680816, "loss": 0.0368, "macro_f1": 0.3137255012989044, "num_tokens": 1299183.0, "repeat_count": 0.0, "routers_loss": 0.09219954162836075, "skip_count": 2.0, "step": 890, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.186453022578296, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.7421875, "learning_rate": 0.0009942616007736697, "loss": 0.0728, "macro_f1": 0.3333333432674408, "num_tokens": 1301744.0, "repeat_count": 0.0, "routers_loss": 0.0038045563269406557, "skip_count": 0.0, "step": 892, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 34.0, "epoch": 5.198106336489439, "f1_execute": 0.970588207244873, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.53515625, "learning_rate": 0.000994202859775434, "loss": 0.0348, "macro_f1": 0.4901960790157318, "num_tokens": 1304663.0, "repeat_count": 0.0, "routers_loss": 0.03482520952820778, "skip_count": 2.0, "step": 894, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.209759650400582, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5859375, "learning_rate": 0.000994143821408719, "loss": 0.0348, "macro_f1": 0.3333333432674408, "num_tokens": 1307441.0, "repeat_count": 0.0, "routers_loss": 0.004592279437929392, "skip_count": 0.0, "step": 896, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.221412964311726, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.62890625, "learning_rate": 0.0009940844857090497, "loss": 0.037, "macro_f1": 0.32380953431129456, "num_tokens": 1310179.0, "repeat_count": 1.0, "routers_loss": 0.03899485617876053, "skip_count": 1.0, "step": 898, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.23306627822287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.349609375, "learning_rate": 0.0009940248527121284, "loss": 0.0301, "macro_f1": 0.3333333432674408, "num_tokens": 1313062.0, "repeat_count": 0.0, "routers_loss": 0.0034514665603637695, "skip_count": 0.0, "step": 900, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.244719592134013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.546875, "learning_rate": 0.0009939649224538374, "loss": 0.0299, "macro_f1": 0.3333333432674408, "num_tokens": 1316992.0, "repeat_count": 0.0, "routers_loss": 0.004644820932298899, "skip_count": 0.0, "step": 902, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 5.256372906045157, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.7734375, "learning_rate": 0.0009939046949702377, "loss": 0.0377, "macro_f1": 0.6666666865348816, "num_tokens": 1319222.0, "repeat_count": 0.0, "routers_loss": 0.01724315620958805, "skip_count": 1.0, "step": 904, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 5.2680262199563, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.4921875, "learning_rate": 0.0009938441702975688, "loss": 0.048, "macro_f1": 0.5507246255874634, "num_tokens": 1321920.0, "repeat_count": 0.0, "routers_loss": 0.022490784525871277, "skip_count": 2.0, "step": 906, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.279679533867443, "f1_execute": 0.9552239179611206, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 1.09375, "learning_rate": 0.0009937833484722494, "loss": 0.065, "macro_f1": 0.5406302213668823, "num_tokens": 1324931.0, "repeat_count": 2.0, "routers_loss": 0.12859506905078888, "skip_count": 1.0, "step": 908, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 34.0, "epoch": 5.2913328477785875, "f1_execute": 0.970588207244873, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.59765625, "learning_rate": 0.0009937222295308766, "loss": 0.0351, "macro_f1": 0.4901960790157318, "num_tokens": 1327962.0, "repeat_count": 0.0, "routers_loss": 0.08273611217737198, "skip_count": 2.0, "step": 910, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.302986161689731, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.51171875, "learning_rate": 0.0009936608135102264, "loss": 0.0381, "macro_f1": 0.3333333432674408, "num_tokens": 1330837.0, "repeat_count": 0.0, "routers_loss": 0.00689684646204114, "skip_count": 0.0, "step": 912, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.314639475600874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.59765625, "learning_rate": 0.0009935991004472538, "loss": 0.0483, "macro_f1": 0.3333333432674408, "num_tokens": 1333751.0, "repeat_count": 0.0, "routers_loss": 0.004771741572767496, "skip_count": 0.0, "step": 914, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 5.326292789512017, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3046875, "learning_rate": 0.0009935370903790925, "loss": 0.0304, "macro_f1": 0.32863849401474, "num_tokens": 1336804.0, "repeat_count": 0.0, "routers_loss": 0.019442904740571976, "skip_count": 0.0, "step": 916, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.25, "avg_layers": 36.0, "epoch": 5.337946103423161, "f1_execute": 0.9538461565971375, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.78125, "learning_rate": 0.0009934747833430545, "loss": 0.0549, "macro_f1": 0.7846153974533081, "num_tokens": 1339326.0, "repeat_count": 1.0, "routers_loss": 0.0536203570663929, "skip_count": 4.0, "step": 918, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 5.349599417334304, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.640625, "learning_rate": 0.0009934121793766311, "loss": 0.0462, "macro_f1": 0.6666666865348816, "num_tokens": 1342740.0, "repeat_count": 0.0, "routers_loss": 0.00730834249407053, "skip_count": 2.0, "step": 920, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.361252731245448, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.361328125, "learning_rate": 0.0009933492785174918, "loss": 0.0329, "macro_f1": 0.3333333432674408, "num_tokens": 1345554.0, "repeat_count": 0.0, "routers_loss": 0.009744085371494293, "skip_count": 0.0, "step": 922, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 5.372906045156592, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.7421875, "learning_rate": 0.0009932860808034847, "loss": 0.0411, "macro_f1": 0.6666666865348816, "num_tokens": 1347796.0, "repeat_count": 0.0, "routers_loss": 0.006334028206765652, "skip_count": 2.0, "step": 924, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 5.384559359067735, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.41796875, "learning_rate": 0.000993222586272637, "loss": 0.0279, "macro_f1": 0.32863849401474, "num_tokens": 1350615.0, "repeat_count": 0.0, "routers_loss": 0.013932809233665466, "skip_count": 0.0, "step": 926, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 5.396212672978878, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.390625, "learning_rate": 0.000993158794963154, "loss": 0.0785, "macro_f1": 0.6666666865348816, "num_tokens": 1353524.0, "repeat_count": 1.0, "routers_loss": 0.005696198437362909, "skip_count": 0.0, "step": 928, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.407865986890021, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.53125, "learning_rate": 0.0009930947069134199, "loss": 0.0382, "macro_f1": 0.32863849401474, "num_tokens": 1356333.0, "repeat_count": 0.0, "routers_loss": 0.014644813723862171, "skip_count": 1.0, "step": 930, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.4195193008011655, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.8203125, "learning_rate": 0.0009930303221619974, "loss": 0.0592, "macro_f1": 0.32380953431129456, "num_tokens": 1359603.0, "repeat_count": 0.0, "routers_loss": 0.020437119528651237, "skip_count": 2.0, "step": 932, "text_loss": 0.0 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 0.3333333432674408, "avg_layers": 36.0, "epoch": 5.431172614712309, "f1_execute": 0.9375, "f1_repeat": 0.5, "f1_skip": 0.5, "grad_norm": 0.76171875, "learning_rate": 0.0009929656407476274, "loss": 0.0404, "macro_f1": 0.6458333730697632, "num_tokens": 1362590.0, "repeat_count": 3.0, "routers_loss": 0.0769520178437233, "skip_count": 3.0, "step": 934, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 5.442825928623452, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.4765625, "learning_rate": 0.0009929006627092298, "loss": 0.0355, "macro_f1": 0.5507246255874634, "num_tokens": 1366417.0, "repeat_count": 0.0, "routers_loss": 0.026866260915994644, "skip_count": 2.0, "step": 936, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.454479242534596, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5625, "learning_rate": 0.000992835388085903, "loss": 0.0355, "macro_f1": 0.3333333432674408, "num_tokens": 1369204.0, "repeat_count": 0.0, "routers_loss": 0.004782802890986204, "skip_count": 0.0, "step": 938, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 5.466132556445739, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.57421875, "learning_rate": 0.000992769816916923, "loss": 0.0304, "macro_f1": 0.32863849401474, "num_tokens": 1372173.0, "repeat_count": 0.0, "routers_loss": 0.012379567138850689, "skip_count": 0.0, "step": 940, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 31.0, "epoch": 5.4777858703568825, "f1_execute": 0.9508196711540222, "f1_repeat": 0.0, "f1_skip": 0.7272727489471436, "grad_norm": 0.55859375, "learning_rate": 0.000992703949241745, "loss": 0.0357, "macro_f1": 0.5593641996383667, "num_tokens": 1374935.0, "repeat_count": 0.0, "routers_loss": 0.06630527973175049, "skip_count": 6.0, "step": 942, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 34.0, "epoch": 5.489439184268027, "f1_execute": 0.9552239179611206, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.5703125, "learning_rate": 0.000992637785100003, "loss": 0.0438, "macro_f1": 0.4517413079738617, "num_tokens": 1378673.0, "repeat_count": 0.0, "routers_loss": 0.06893254816532135, "skip_count": 3.0, "step": 944, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 5.50109249817917, "f1_execute": 0.9696969985961914, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 0.515625, "learning_rate": 0.0009925713245315083, "loss": 0.043, "macro_f1": 0.8232323527336121, "num_tokens": 1381499.0, "repeat_count": 1.0, "routers_loss": 0.05063093081116676, "skip_count": 2.0, "step": 946, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 34.0, "epoch": 5.512745812090313, "f1_execute": 0.90625, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 0.71484375, "learning_rate": 0.0009925045675762514, "loss": 0.0422, "macro_f1": 0.413194477558136, "num_tokens": 1384122.0, "repeat_count": 2.0, "routers_loss": 0.2188098430633545, "skip_count": 4.0, "step": 948, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.524399126001457, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.392578125, "learning_rate": 0.0009924375142744007, "loss": 0.0317, "macro_f1": 0.3333333432674408, "num_tokens": 1386955.0, "repeat_count": 0.0, "routers_loss": 0.011038658209145069, "skip_count": 0.0, "step": 950, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 5.5360524399126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.5, "learning_rate": 0.000992370164666303, "loss": 0.0368, "macro_f1": 0.6666666865348816, "num_tokens": 1390176.0, "repeat_count": 0.0, "routers_loss": 0.012464593164622784, "skip_count": 2.0, "step": 952, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.5477057538237435, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.67578125, "learning_rate": 0.0009923025187924836, "loss": 0.0342, "macro_f1": 0.3333333432674408, "num_tokens": 1392820.0, "repeat_count": 0.0, "routers_loss": 0.010769741609692574, "skip_count": 0.0, "step": 954, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 34.0, "epoch": 5.559359067734887, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.71875, "learning_rate": 0.0009922345766936462, "loss": 0.043, "macro_f1": 0.32380953431129456, "num_tokens": 1395418.0, "repeat_count": 0.0, "routers_loss": 0.10096780210733414, "skip_count": 0.0, "step": 956, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.4285714328289032, "avg_layers": 33.0, "epoch": 5.571012381646031, "f1_execute": 0.9354838728904724, "f1_repeat": 0.0, "f1_skip": 0.5999999642372131, "grad_norm": 0.478515625, "learning_rate": 0.000992166338410672, "loss": 0.0433, "macro_f1": 0.5118279457092285, "num_tokens": 1399151.0, "repeat_count": 0.0, "routers_loss": 0.10303844511508942, "skip_count": 7.0, "step": 958, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.582665695557174, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.314453125, "learning_rate": 0.000992097803984621, "loss": 0.0238, "macro_f1": 0.32863849401474, "num_tokens": 1402034.0, "repeat_count": 0.0, "routers_loss": 0.012579351663589478, "skip_count": 1.0, "step": 960, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 5.594319009468317, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.326171875, "learning_rate": 0.0009920289734567313, "loss": 0.0463, "macro_f1": 0.8839138746261597, "num_tokens": 1404346.0, "repeat_count": 1.0, "routers_loss": 0.030148671939969063, "skip_count": 2.0, "step": 962, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 5.605972323379461, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.6953125, "learning_rate": 0.0009919598468684195, "loss": 0.0415, "macro_f1": 0.32863849401474, "num_tokens": 1406994.0, "repeat_count": 0.0, "routers_loss": 0.012787901796400547, "skip_count": 0.0, "step": 964, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 5.617625637290605, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.671875, "learning_rate": 0.0009918904242612794, "loss": 0.042, "macro_f1": 0.4901960790157318, "num_tokens": 1409814.0, "repeat_count": 0.0, "routers_loss": 0.06239202246069908, "skip_count": 3.0, "step": 966, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.629278951201748, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.609375, "learning_rate": 0.0009918207056770839, "loss": 0.0414, "macro_f1": 0.3333333432674408, "num_tokens": 1412585.0, "repeat_count": 0.0, "routers_loss": 0.00709152827039361, "skip_count": 0.0, "step": 968, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.640932265112891, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5703125, "learning_rate": 0.0009917506911577836, "loss": 0.048, "macro_f1": 0.32863849401474, "num_tokens": 1415426.0, "repeat_count": 0.0, "routers_loss": 0.03353523090481758, "skip_count": 1.0, "step": 970, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.652585579024035, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5234375, "learning_rate": 0.000991680380745507, "loss": 0.0329, "macro_f1": 0.3333333432674408, "num_tokens": 1417963.0, "repeat_count": 0.0, "routers_loss": 0.004209158476442099, "skip_count": 0.0, "step": 972, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 5.664238892935178, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.59375, "learning_rate": 0.0009916097744825608, "loss": 0.0528, "macro_f1": 0.6666666865348816, "num_tokens": 1420814.0, "repeat_count": 1.0, "routers_loss": 0.005969550926238298, "skip_count": 0.0, "step": 974, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 34.0, "epoch": 5.675892206846322, "f1_execute": 0.9375, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.55078125, "learning_rate": 0.0009915388724114301, "loss": 0.0406, "macro_f1": 0.4791666865348816, "num_tokens": 1424490.0, "repeat_count": 0.0, "routers_loss": 0.15840749442577362, "skip_count": 6.0, "step": 976, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.687545520757466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.59765625, "learning_rate": 0.0009914676745747771, "loss": 0.0504, "macro_f1": 0.3333333432674408, "num_tokens": 1428764.0, "repeat_count": 0.0, "routers_loss": 0.02140953205525875, "skip_count": 0.0, "step": 978, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 5.699198834668609, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.59375, "learning_rate": 0.000991396181015443, "loss": 0.0407, "macro_f1": 0.32863849401474, "num_tokens": 1431379.0, "repeat_count": 0.0, "routers_loss": 0.019209571182727814, "skip_count": 0.0, "step": 980, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 5.710852148579752, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.703125, "learning_rate": 0.000991324391776446, "loss": 0.0443, "macro_f1": 0.32380953431129456, "num_tokens": 1434264.0, "repeat_count": 0.0, "routers_loss": 0.09184425324201584, "skip_count": 1.0, "step": 982, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 5.722505462490896, "f1_execute": 0.9846153855323792, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.8359375, "learning_rate": 0.000991252306900983, "loss": 0.0464, "macro_f1": 0.8837606906890869, "num_tokens": 1436984.0, "repeat_count": 2.0, "routers_loss": 0.05655256286263466, "skip_count": 2.0, "step": 984, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 5.734158776402039, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5390625, "learning_rate": 0.000991179926432428, "loss": 0.0294, "macro_f1": 0.32863849401474, "num_tokens": 1440945.0, "repeat_count": 0.0, "routers_loss": 0.019932935014367104, "skip_count": 0.0, "step": 986, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 5.745812090313183, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.71484375, "learning_rate": 0.0009911072504143338, "loss": 0.0419, "macro_f1": 0.3137255311012268, "num_tokens": 1444055.0, "repeat_count": 1.0, "routers_loss": 0.13457120954990387, "skip_count": 2.0, "step": 988, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.757465404224327, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.60546875, "learning_rate": 0.0009910342788904303, "loss": 0.052, "macro_f1": 0.32380953431129456, "num_tokens": 1446670.0, "repeat_count": 0.0, "routers_loss": 0.023401308804750443, "skip_count": 2.0, "step": 990, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 34.0, "epoch": 5.76911871813547, "f1_execute": 0.970588207244873, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.9609375, "learning_rate": 0.0009909610119046253, "loss": 0.0399, "macro_f1": 0.4901960790157318, "num_tokens": 1449350.0, "repeat_count": 0.0, "routers_loss": 0.043829914182424545, "skip_count": 2.0, "step": 992, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 5.780772032046613, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.56640625, "learning_rate": 0.0009908874495010048, "loss": 0.0277, "macro_f1": 0.5507246255874634, "num_tokens": 1452397.0, "repeat_count": 0.0, "routers_loss": 0.010866532102227211, "skip_count": 2.0, "step": 994, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 5.792425345957756, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.486328125, "learning_rate": 0.000990813591723832, "loss": 0.0339, "macro_f1": 0.32380953431129456, "num_tokens": 1456003.0, "repeat_count": 1.0, "routers_loss": 0.04475317895412445, "skip_count": 0.0, "step": 996, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 5.8040786598689005, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.50390625, "learning_rate": 0.0009907394386175483, "loss": 0.0393, "macro_f1": 0.656862735748291, "num_tokens": 1458542.0, "repeat_count": 2.0, "routers_loss": 0.025290438905358315, "skip_count": 1.0, "step": 998, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 5.815731973780044, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.54296875, "learning_rate": 0.0009906649902267728, "loss": 0.037, "macro_f1": 0.661835789680481, "num_tokens": 1461430.0, "repeat_count": 1.0, "routers_loss": 0.00995925534516573, "skip_count": 1.0, "step": 1000, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.827385287691187, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.466796875, "learning_rate": 0.0009905902465963015, "loss": 0.0388, "macro_f1": 0.32863849401474, "num_tokens": 1464050.0, "repeat_count": 0.0, "routers_loss": 0.021144574508070946, "skip_count": 1.0, "step": 1002, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.6666666865348816, "avg_layers": 33.0, "epoch": 5.839038601602331, "f1_execute": 0.9491525292396545, "f1_repeat": 0.6666666865348816, "f1_skip": 0.800000011920929, "grad_norm": 0.73828125, "learning_rate": 0.0009905152077711093, "loss": 0.0402, "macro_f1": 0.8052730560302734, "num_tokens": 1467675.0, "repeat_count": 2.0, "routers_loss": 0.04510732367634773, "skip_count": 6.0, "step": 1004, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 5.850691915513474, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.57421875, "learning_rate": 0.0009904398737963479, "loss": 0.0354, "macro_f1": 0.6666666865348816, "num_tokens": 1470603.0, "repeat_count": 0.0, "routers_loss": 0.010821451433002949, "skip_count": 1.0, "step": 1006, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.862345229424617, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.96484375, "learning_rate": 0.0009903642447173465, "loss": 0.0549, "macro_f1": 0.3333333432674408, "num_tokens": 1473383.0, "repeat_count": 0.0, "routers_loss": 0.007003359496593475, "skip_count": 0.0, "step": 1008, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 5.873998543335761, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.37890625, "learning_rate": 0.000990288320579612, "loss": 0.0449, "macro_f1": 0.661835789680481, "num_tokens": 1476036.0, "repeat_count": 1.0, "routers_loss": 0.0361940898001194, "skip_count": 1.0, "step": 1010, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 5.885651857246905, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.54296875, "learning_rate": 0.0009902121014288296, "loss": 0.0347, "macro_f1": 0.5507246255874634, "num_tokens": 1479663.0, "repeat_count": 0.0, "routers_loss": 0.07552403956651688, "skip_count": 2.0, "step": 1012, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.897305171158048, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.365234375, "learning_rate": 0.000990135587310861, "loss": 0.0379, "macro_f1": 0.3333333432674408, "num_tokens": 1482566.0, "repeat_count": 0.0, "routers_loss": 0.012141061015427113, "skip_count": 0.0, "step": 1014, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 35.0, "epoch": 5.908958485069191, "f1_execute": 0.9552239179611206, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.482421875, "learning_rate": 0.0009900587782717457, "loss": 0.0388, "macro_f1": 0.4517413079738617, "num_tokens": 1485499.0, "repeat_count": 0.0, "routers_loss": 0.054710108786821365, "skip_count": 4.0, "step": 1016, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.431640625, "learning_rate": 0.0009899816743577008, "loss": 0.0352, "macro_f1": 0.3333333432674408, "num_tokens": 1488752.0, "repeat_count": 0.0, "routers_loss": 0.011138529516756535, "skip_count": 0.0, "step": 1018, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 5.9322651128914785, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.52734375, "learning_rate": 0.0009899042756151208, "loss": 0.0458, "macro_f1": 0.32380953431129456, "num_tokens": 1491613.0, "repeat_count": 0.0, "routers_loss": 0.03591640666127205, "skip_count": 1.0, "step": 1020, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.943918426802622, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.408203125, "learning_rate": 0.0009898265820905776, "loss": 0.0322, "macro_f1": 0.3333333432674408, "num_tokens": 1494427.0, "repeat_count": 0.0, "routers_loss": 0.011490537784993649, "skip_count": 0.0, "step": 1022, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.955571740713766, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.80078125, "learning_rate": 0.0009897485938308204, "loss": 0.044, "macro_f1": 0.3333333432674408, "num_tokens": 1497108.0, "repeat_count": 0.0, "routers_loss": 0.002494579181075096, "skip_count": 0.0, "step": 1024, "text_loss": 0.0 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 5.967225054624909, "f1_execute": 0.920634925365448, "f1_repeat": 0.5, "f1_skip": 0.4000000059604645, "grad_norm": 0.6171875, "learning_rate": 0.000989670310882776, "loss": 0.0435, "macro_f1": 0.6068782806396484, "num_tokens": 1499852.0, "repeat_count": 3.0, "routers_loss": 0.22367845475673676, "skip_count": 3.0, "step": 1026, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.978878368536052, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5625, "learning_rate": 0.0009895917332935476, "loss": 0.0425, "macro_f1": 0.3137255012989044, "num_tokens": 1502532.0, "repeat_count": 0.0, "routers_loss": 0.06451278179883957, "skip_count": 2.0, "step": 1028, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 5.990531682447196, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2109375, "learning_rate": 0.000989512861110417, "loss": 0.0676, "macro_f1": 0.32863849401474, "num_tokens": 1505126.0, "repeat_count": 0.0, "routers_loss": 0.32084858417510986, "skip_count": 1.0, "step": 1030, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 6.0, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5546875, "learning_rate": 0.0009894336943808426, "loss": 0.0395, "macro_f1": 0.32863849401474, "num_tokens": 1507296.0, "repeat_count": 0.0, "routers_loss": 0.014526973478496075, "skip_count": 0.0, "step": 1032, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 6.011653313911143, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.302734375, "learning_rate": 0.0009893542331524598, "loss": 0.0182, "macro_f1": 1.0, "num_tokens": 1510215.0, "repeat_count": 1.0, "routers_loss": 0.005060628522187471, "skip_count": 1.0, "step": 1034, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 6.023306627822287, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.77734375, "learning_rate": 0.0009892744774730817, "loss": 0.026, "macro_f1": 0.32380953431129456, "num_tokens": 1513257.0, "repeat_count": 1.0, "routers_loss": 0.04398665204644203, "skip_count": 0.0, "step": 1036, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 6.0349599417334305, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.453125, "learning_rate": 0.0009891944273906985, "loss": 0.0279, "macro_f1": 0.5507246255874634, "num_tokens": 1516378.0, "repeat_count": 0.0, "routers_loss": 0.032597463577985764, "skip_count": 2.0, "step": 1038, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 6.046613255644574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.408203125, "learning_rate": 0.0009891140829534771, "loss": 0.0298, "macro_f1": 0.6666666865348816, "num_tokens": 1519140.0, "repeat_count": 0.0, "routers_loss": 0.01456831581890583, "skip_count": 2.0, "step": 1040, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6000000238418579, "avg_layers": 33.0, "epoch": 6.058266569555718, "f1_execute": 0.9687499403953552, "f1_repeat": 0.0, "f1_skip": 0.75, "grad_norm": 0.55859375, "learning_rate": 0.000989033444209762, "loss": 0.0341, "macro_f1": 0.5729166865348816, "num_tokens": 1521858.0, "repeat_count": 0.0, "routers_loss": 0.08104380965232849, "skip_count": 5.0, "step": 1042, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 6.069919883466861, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.2236328125, "learning_rate": 0.0009889525112080745, "loss": 0.0214, "macro_f1": 0.6666666865348816, "num_tokens": 1524862.0, "repeat_count": 1.0, "routers_loss": 0.004747647326439619, "skip_count": 0.0, "step": 1044, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.081573197378004, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3359375, "learning_rate": 0.0009888712839971133, "loss": 0.0193, "macro_f1": 0.3333333432674408, "num_tokens": 1528320.0, "repeat_count": 0.0, "routers_loss": 0.004961361642926931, "skip_count": 0.0, "step": 1046, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 6.0932265112891475, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.8984375, "learning_rate": 0.0009887897626257537, "loss": 0.0344, "macro_f1": 0.5507246255874634, "num_tokens": 1530920.0, "repeat_count": 0.0, "routers_loss": 0.018271654844284058, "skip_count": 2.0, "step": 1048, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.104879825200292, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.70703125, "learning_rate": 0.000988707947143048, "loss": 0.0252, "macro_f1": 0.3333333432674408, "num_tokens": 1533581.0, "repeat_count": 0.0, "routers_loss": 0.011218656785786152, "skip_count": 0.0, "step": 1050, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 6.116533139111435, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.455078125, "learning_rate": 0.000988625837598226, "loss": 0.0331, "macro_f1": 0.5950249433517456, "num_tokens": 1537291.0, "repeat_count": 0.0, "routers_loss": 0.0657576322555542, "skip_count": 3.0, "step": 1052, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 6.128186453022578, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.60546875, "learning_rate": 0.000988543434040694, "loss": 0.0319, "macro_f1": 0.5507246255874634, "num_tokens": 1540247.0, "repeat_count": 0.0, "routers_loss": 0.044745173305273056, "skip_count": 2.0, "step": 1054, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.139839766933722, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.64453125, "learning_rate": 0.0009884607365200355, "loss": 0.039, "macro_f1": 0.32863849401474, "num_tokens": 1543156.0, "repeat_count": 1.0, "routers_loss": 0.02763236314058304, "skip_count": 0.0, "step": 1056, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 6.151493080844865, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.70703125, "learning_rate": 0.0009883777450860106, "loss": 0.0435, "macro_f1": 0.5507246255874634, "num_tokens": 1546657.0, "repeat_count": 0.0, "routers_loss": 0.03906804695725441, "skip_count": 2.0, "step": 1058, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 6.163146394756009, "f1_execute": 0.970588207244873, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.68359375, "learning_rate": 0.000988294459788556, "loss": 0.0336, "macro_f1": 0.545751690864563, "num_tokens": 1549234.0, "repeat_count": 0.0, "routers_loss": 0.047379035502672195, "skip_count": 2.0, "step": 1060, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.174799708667152, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.55078125, "learning_rate": 0.0009882108806777861, "loss": 0.032, "macro_f1": 0.3333333432674408, "num_tokens": 1552377.0, "repeat_count": 0.0, "routers_loss": 0.00422747852280736, "skip_count": 0.0, "step": 1062, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 6.186453022578296, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.453125, "learning_rate": 0.0009881270078039913, "loss": 0.0251, "macro_f1": 1.0, "num_tokens": 1555408.0, "repeat_count": 1.0, "routers_loss": 0.01011748518794775, "skip_count": 2.0, "step": 1064, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.198106336489439, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.578125, "learning_rate": 0.0009880428412176391, "loss": 0.0344, "macro_f1": 0.3333333432674408, "num_tokens": 1558141.0, "repeat_count": 0.0, "routers_loss": 0.003121395595371723, "skip_count": 0.0, "step": 1066, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 6.209759650400582, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.57421875, "learning_rate": 0.0009879583809693738, "loss": 0.0277, "macro_f1": 0.6666666865348816, "num_tokens": 1561010.0, "repeat_count": 0.0, "routers_loss": 0.008257124572992325, "skip_count": 1.0, "step": 1068, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.221412964311726, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2890625, "learning_rate": 0.0009878736271100158, "loss": 0.0201, "macro_f1": 0.32863849401474, "num_tokens": 1565137.0, "repeat_count": 1.0, "routers_loss": 0.01921830326318741, "skip_count": 0.0, "step": 1070, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 6.23306627822287, "f1_execute": 0.9836065173149109, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.361328125, "learning_rate": 0.0009877885796905632, "loss": 0.0398, "macro_f1": 0.9278688430786133, "num_tokens": 1567727.0, "repeat_count": 3.0, "routers_loss": 0.018740251660346985, "skip_count": 3.0, "step": 1072, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 6.244719592134013, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.408203125, "learning_rate": 0.0009877032387621899, "loss": 0.0269, "macro_f1": 0.32863849401474, "num_tokens": 1570638.0, "repeat_count": 0.0, "routers_loss": 0.012496289797127247, "skip_count": 0.0, "step": 1074, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.256372906045157, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.95703125, "learning_rate": 0.0009876176043762466, "loss": 0.0362, "macro_f1": 0.3188405930995941, "num_tokens": 1573620.0, "repeat_count": 0.0, "routers_loss": 0.07265384495258331, "skip_count": 1.0, "step": 1076, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.2680262199563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.62109375, "learning_rate": 0.0009875316765842609, "loss": 0.0449, "macro_f1": 0.3333333432674408, "num_tokens": 1577157.0, "repeat_count": 0.0, "routers_loss": 0.003356463508680463, "skip_count": 0.0, "step": 1078, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 6.279679533867443, "f1_execute": 0.9841269850730896, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.49609375, "learning_rate": 0.0009874454554379363, "loss": 0.0258, "macro_f1": 0.6613757014274597, "num_tokens": 1580934.0, "repeat_count": 1.0, "routers_loss": 0.057081568986177444, "skip_count": 4.0, "step": 1080, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.2913328477785875, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.7890625, "learning_rate": 0.0009873589409891536, "loss": 0.0422, "macro_f1": 0.32863849401474, "num_tokens": 1583415.0, "repeat_count": 0.0, "routers_loss": 0.028107358142733574, "skip_count": 1.0, "step": 1082, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.302986161689731, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.4609375, "learning_rate": 0.0009872721332899697, "loss": 0.0364, "macro_f1": 0.3333333432674408, "num_tokens": 1586924.0, "repeat_count": 0.0, "routers_loss": 0.010826284997165203, "skip_count": 0.0, "step": 1084, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 6.314639475600874, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.39453125, "learning_rate": 0.0009871850323926177, "loss": 0.0275, "macro_f1": 0.32380953431129456, "num_tokens": 1589754.0, "repeat_count": 0.0, "routers_loss": 0.04788600653409958, "skip_count": 1.0, "step": 1086, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 6.326292789512017, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.53125, "learning_rate": 0.0009870976383495073, "loss": 0.025, "macro_f1": 0.6616915464401245, "num_tokens": 1592715.0, "repeat_count": 0.0, "routers_loss": 0.06787339597940445, "skip_count": 2.0, "step": 1088, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 33.0, "epoch": 6.337946103423161, "f1_execute": 0.9354838728904724, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.80078125, "learning_rate": 0.0009870099512132254, "loss": 0.0373, "macro_f1": 0.5340502262115479, "num_tokens": 1595612.0, "repeat_count": 1.0, "routers_loss": 0.10050515085458755, "skip_count": 6.0, "step": 1090, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 6.349599417334304, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.33984375, "learning_rate": 0.0009869219710365335, "loss": 0.0303, "macro_f1": 0.6666666865348816, "num_tokens": 1598477.0, "repeat_count": 0.0, "routers_loss": 0.022984815761446953, "skip_count": 1.0, "step": 1092, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.361252731245448, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2890625, "learning_rate": 0.0009868336978723711, "loss": 0.0287, "macro_f1": 0.3333333432674408, "num_tokens": 1601541.0, "repeat_count": 0.0, "routers_loss": 0.0035244624596089125, "skip_count": 0.0, "step": 1094, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 33.0, "epoch": 6.372906045156592, "f1_execute": 0.9687499403953552, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.4453125, "learning_rate": 0.0009867451317738534, "loss": 0.0384, "macro_f1": 0.6086309552192688, "num_tokens": 1604396.0, "repeat_count": 1.0, "routers_loss": 0.11099375039339066, "skip_count": 4.0, "step": 1096, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.384559359067735, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.357421875, "learning_rate": 0.0009866562727942714, "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 1607288.0, "repeat_count": 0.0, "routers_loss": 0.005733107682317495, "skip_count": 0.0, "step": 1098, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 6.396212672978878, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.546875, "learning_rate": 0.000986567120987093, "loss": 0.0272, "macro_f1": 0.5507246255874634, "num_tokens": 1610177.0, "repeat_count": 0.0, "routers_loss": 0.029117843136191368, "skip_count": 2.0, "step": 1100, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.407865986890021, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.466796875, "learning_rate": 0.000986477676405962, "loss": 0.0184, "macro_f1": 0.3333333432674408, "num_tokens": 1612804.0, "repeat_count": 0.0, "routers_loss": 0.006920638494193554, "skip_count": 0.0, "step": 1102, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 6.4195193008011655, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.40234375, "learning_rate": 0.0009863879391046983, "loss": 0.0276, "macro_f1": 1.0, "num_tokens": 1615500.0, "repeat_count": 1.0, "routers_loss": 0.014973138459026814, "skip_count": 2.0, "step": 1104, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.431172614712309, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.4765625, "learning_rate": 0.0009862979091372981, "loss": 0.0381, "macro_f1": 0.32863849401474, "num_tokens": 1617983.0, "repeat_count": 1.0, "routers_loss": 0.03507894650101662, "skip_count": 0.0, "step": 1106, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 6.442825928623452, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.4453125, "learning_rate": 0.0009862075865579337, "loss": 0.0327, "macro_f1": 0.5950249433517456, "num_tokens": 1621910.0, "repeat_count": 0.0, "routers_loss": 0.0259665809571743, "skip_count": 3.0, "step": 1108, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.454479242534596, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.45703125, "learning_rate": 0.0009861169714209534, "loss": 0.0389, "macro_f1": 0.32863849401474, "num_tokens": 1625153.0, "repeat_count": 0.0, "routers_loss": 0.026469359174370766, "skip_count": 1.0, "step": 1110, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.466132556445739, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.330078125, "learning_rate": 0.0009860260637808815, "loss": 0.0141, "macro_f1": 0.3333333432674408, "num_tokens": 1627873.0, "repeat_count": 0.0, "routers_loss": 0.0036348679568618536, "skip_count": 0.0, "step": 1112, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 6.4777858703568825, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.6796875, "learning_rate": 0.0009859348636924183, "loss": 0.035, "macro_f1": 0.8839138746261597, "num_tokens": 1630657.0, "repeat_count": 1.0, "routers_loss": 0.028294481337070465, "skip_count": 2.0, "step": 1114, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.489439184268027, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.4921875, "learning_rate": 0.0009858433712104401, "loss": 0.021, "macro_f1": 0.32863849401474, "num_tokens": 1633452.0, "repeat_count": 0.0, "routers_loss": 0.008228227496147156, "skip_count": 1.0, "step": 1116, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.50109249817917, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.421875, "learning_rate": 0.0009857515863899993, "loss": 0.0256, "macro_f1": 0.3333333432674408, "num_tokens": 1636138.0, "repeat_count": 0.0, "routers_loss": 0.0034616210032254457, "skip_count": 0.0, "step": 1118, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.4000000059604645, "avg_layers": 35.0, "epoch": 6.512745812090313, "f1_execute": 0.9523809552192688, "f1_repeat": 1.0, "f1_skip": 0.5714285969734192, "grad_norm": 0.46875, "learning_rate": 0.0009856595092863239, "loss": 0.0211, "macro_f1": 0.841269850730896, "num_tokens": 1638520.0, "repeat_count": 1.0, "routers_loss": 0.08931826800107956, "skip_count": 5.0, "step": 1120, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.524399126001457, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.29296875, "learning_rate": 0.000985567139954818, "loss": 0.026, "macro_f1": 0.3333333432674408, "num_tokens": 1641464.0, "repeat_count": 0.0, "routers_loss": 0.003908978775143623, "skip_count": 0.0, "step": 1122, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.5360524399126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.4453125, "learning_rate": 0.0009854744784510616, "loss": 0.0194, "macro_f1": 0.3333333432674408, "num_tokens": 1643944.0, "repeat_count": 0.0, "routers_loss": 0.009978041052818298, "skip_count": 0.0, "step": 1124, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 6.5477057538237435, "f1_execute": 0.970588207244873, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.447265625, "learning_rate": 0.00098538152483081, "loss": 0.0243, "macro_f1": 0.545751690864563, "num_tokens": 1647122.0, "repeat_count": 1.0, "routers_loss": 0.04983573034405708, "skip_count": 1.0, "step": 1126, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 33.0, "epoch": 6.559359067734887, "f1_execute": 0.9687499403953552, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.392578125, "learning_rate": 0.000985288279149995, "loss": 0.0232, "macro_f1": 0.6086309552192688, "num_tokens": 1650260.0, "repeat_count": 1.0, "routers_loss": 0.03724395111203194, "skip_count": 4.0, "step": 1128, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.571012381646031, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.515625, "learning_rate": 0.0009851947414647236, "loss": 0.0393, "macro_f1": 0.3333333432674408, "num_tokens": 1653841.0, "repeat_count": 0.0, "routers_loss": 0.0032431045547127724, "skip_count": 0.0, "step": 1130, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.8333333134651184, "avg_layers": 31.0, "epoch": 6.582665695557174, "f1_execute": 0.9836065173149109, "f1_repeat": 0.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.609375, "learning_rate": 0.0009851009118312785, "loss": 0.0281, "macro_f1": 0.63089919090271, "num_tokens": 1656862.0, "repeat_count": 0.0, "routers_loss": 0.048034071922302246, "skip_count": 6.0, "step": 1132, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.594319009468317, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.49609375, "learning_rate": 0.0009850067903061188, "loss": 0.0292, "macro_f1": 0.3333333432674408, "num_tokens": 1659186.0, "repeat_count": 0.0, "routers_loss": 0.00550835719332099, "skip_count": 0.0, "step": 1134, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.605972323379461, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.36328125, "learning_rate": 0.0009849123769458783, "loss": 0.0263, "macro_f1": 0.3333333432674408, "num_tokens": 1662180.0, "repeat_count": 0.0, "routers_loss": 0.004316565115004778, "skip_count": 0.0, "step": 1136, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 6.617625637290605, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.515625, "learning_rate": 0.0009848176718073667, "loss": 0.0369, "macro_f1": 0.4901960790157318, "num_tokens": 1664825.0, "repeat_count": 0.0, "routers_loss": 0.050767943263053894, "skip_count": 3.0, "step": 1138, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.629278951201748, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.90625, "learning_rate": 0.0009847226749475696, "loss": 0.0267, "macro_f1": 0.32863849401474, "num_tokens": 1667572.0, "repeat_count": 1.0, "routers_loss": 0.04237135127186775, "skip_count": 0.0, "step": 1140, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 6.640932265112891, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.7890625, "learning_rate": 0.0009846273864236475, "loss": 0.0481, "macro_f1": 0.5507246255874634, "num_tokens": 1670308.0, "repeat_count": 0.0, "routers_loss": 0.022679375484585762, "skip_count": 2.0, "step": 1142, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.652585579024035, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.421875, "learning_rate": 0.0009845318062929372, "loss": 0.0217, "macro_f1": 0.32863849401474, "num_tokens": 1672913.0, "repeat_count": 1.0, "routers_loss": 0.022464348003268242, "skip_count": 0.0, "step": 1144, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.664238892935178, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.55859375, "learning_rate": 0.0009844359346129503, "loss": 0.0322, "macro_f1": 0.3333333432674408, "num_tokens": 1675758.0, "repeat_count": 0.0, "routers_loss": 0.006146182771772146, "skip_count": 0.0, "step": 1146, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 6.675892206846322, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.75390625, "learning_rate": 0.0009843397714413744, "loss": 0.0421, "macro_f1": 0.6666666865348816, "num_tokens": 1678297.0, "repeat_count": 0.0, "routers_loss": 0.008317895233631134, "skip_count": 3.0, "step": 1148, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.687545520757466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.58984375, "learning_rate": 0.0009842433168360718, "loss": 0.027, "macro_f1": 0.3333333432674408, "num_tokens": 1681128.0, "repeat_count": 0.0, "routers_loss": 0.008025613613426685, "skip_count": 0.0, "step": 1150, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 6.699198834668609, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.66796875, "learning_rate": 0.0009841465708550806, "loss": 0.0304, "macro_f1": 0.3188405930995941, "num_tokens": 1683814.0, "repeat_count": 0.0, "routers_loss": 0.07148655503988266, "skip_count": 2.0, "step": 1152, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.710852148579752, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.26171875, "learning_rate": 0.0009840495335566141, "loss": 0.0198, "macro_f1": 0.3333333432674408, "num_tokens": 1686738.0, "repeat_count": 0.0, "routers_loss": 0.005066995974630117, "skip_count": 0.0, "step": 1154, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 6.722505462490896, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5078125, "learning_rate": 0.0009839522049990613, "loss": 0.0357, "macro_f1": 0.3188405930995941, "num_tokens": 1689262.0, "repeat_count": 1.0, "routers_loss": 0.038943249732255936, "skip_count": 1.0, "step": 1156, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.734158776402039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2412109375, "learning_rate": 0.0009838545852409856, "loss": 0.0184, "macro_f1": 0.3333333432674408, "num_tokens": 1692006.0, "repeat_count": 0.0, "routers_loss": 0.004280155058950186, "skip_count": 0.0, "step": 1158, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 6.745812090313183, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.62890625, "learning_rate": 0.0009837566743411265, "loss": 0.0292, "macro_f1": 1.0, "num_tokens": 1694870.0, "repeat_count": 3.0, "routers_loss": 0.006653112359344959, "skip_count": 4.0, "step": 1160, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.757465404224327, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.29296875, "learning_rate": 0.0009836584723583982, "loss": 0.0228, "macro_f1": 0.3333333432674408, "num_tokens": 1697643.0, "repeat_count": 0.0, "routers_loss": 0.0012892537051811814, "skip_count": 0.0, "step": 1162, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.76911871813547, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.416015625, "learning_rate": 0.00098355997935189, "loss": 0.0191, "macro_f1": 0.3333333432674408, "num_tokens": 1700641.0, "repeat_count": 0.0, "routers_loss": 0.002088136039674282, "skip_count": 0.0, "step": 1164, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 6.780772032046613, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.4921875, "learning_rate": 0.0009834611953808666, "loss": 0.0327, "macro_f1": 1.0, "num_tokens": 1703691.0, "repeat_count": 1.0, "routers_loss": 0.012973080389201641, "skip_count": 1.0, "step": 1166, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 6.792425345957756, "f1_execute": 0.970588207244873, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.6875, "learning_rate": 0.0009833621205047676, "loss": 0.028, "macro_f1": 0.545751690864563, "num_tokens": 1706233.0, "repeat_count": 0.0, "routers_loss": 0.02272757887840271, "skip_count": 2.0, "step": 1168, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.8040786598689005, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.41796875, "learning_rate": 0.0009832627547832077, "loss": 0.0215, "macro_f1": 0.32863849401474, "num_tokens": 1708826.0, "repeat_count": 1.0, "routers_loss": 0.03785425424575806, "skip_count": 0.0, "step": 1170, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 6.815731973780044, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.68359375, "learning_rate": 0.0009831630982759766, "loss": 0.0363, "macro_f1": 0.5507246255874634, "num_tokens": 1711874.0, "repeat_count": 0.0, "routers_loss": 0.019498901441693306, "skip_count": 2.0, "step": 1172, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.827385287691187, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.41796875, "learning_rate": 0.0009830631510430387, "loss": 0.0219, "macro_f1": 0.3333333432674408, "num_tokens": 1714379.0, "repeat_count": 0.0, "routers_loss": 0.005248184781521559, "skip_count": 0.0, "step": 1174, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 6.839038601602331, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.5859375, "learning_rate": 0.0009829629131445341, "loss": 0.0272, "macro_f1": 0.5507246255874634, "num_tokens": 1717022.0, "repeat_count": 0.0, "routers_loss": 0.039058029651641846, "skip_count": 2.0, "step": 1176, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.850691915513474, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5703125, "learning_rate": 0.0009828623846407768, "loss": 0.0308, "macro_f1": 0.3333333432674408, "num_tokens": 1719749.0, "repeat_count": 0.0, "routers_loss": 0.001950376550666988, "skip_count": 0.0, "step": 1178, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 6.862345229424617, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.47265625, "learning_rate": 0.0009827615655922566, "loss": 0.0284, "macro_f1": 0.32380953431129456, "num_tokens": 1722888.0, "repeat_count": 0.0, "routers_loss": 0.022712863981723785, "skip_count": 1.0, "step": 1180, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.873998543335761, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5078125, "learning_rate": 0.0009826604560596373, "loss": 0.0308, "macro_f1": 0.32863849401474, "num_tokens": 1725783.0, "repeat_count": 0.0, "routers_loss": 0.03978192061185837, "skip_count": 1.0, "step": 1182, "text_loss": 0.0 }, { "acc_repeat": 0.75, "acc_skip": 1.0, "avg_layers": 38.0, "epoch": 6.885651857246905, "f1_execute": 0.9836065173149109, "f1_repeat": 0.75, "f1_skip": 0.6666666865348816, "grad_norm": 0.6171875, "learning_rate": 0.000982559056103758, "loss": 0.0357, "macro_f1": 0.8000911474227905, "num_tokens": 1728619.0, "repeat_count": 4.0, "routers_loss": 0.06328882277011871, "skip_count": 1.0, "step": 1184, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.897305171158048, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.625, "learning_rate": 0.0009824573657856324, "loss": 0.0314, "macro_f1": 0.32863849401474, "num_tokens": 1731590.0, "repeat_count": 1.0, "routers_loss": 0.022621624171733856, "skip_count": 0.0, "step": 1186, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 6.908958485069191, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.349609375, "learning_rate": 0.000982355385166449, "loss": 0.0315, "macro_f1": 0.5507246255874634, "num_tokens": 1734478.0, "repeat_count": 0.0, "routers_loss": 0.0214589461684227, "skip_count": 2.0, "step": 1188, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.5, "avg_layers": 34.0, "epoch": 6.920611798980335, "f1_execute": 0.9354838132858276, "f1_repeat": 0.6666666865348816, "f1_skip": 0.5714285373687744, "grad_norm": 0.369140625, "learning_rate": 0.0009822531143075706, "loss": 0.0228, "macro_f1": 0.7245264053344727, "num_tokens": 1737477.0, "repeat_count": 2.0, "routers_loss": 0.0679670050740242, "skip_count": 4.0, "step": 1190, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.9322651128914785, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.36328125, "learning_rate": 0.000982150553270535, "loss": 0.0318, "macro_f1": 0.32863849401474, "num_tokens": 1740906.0, "repeat_count": 0.0, "routers_loss": 0.011117727495729923, "skip_count": 1.0, "step": 1192, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 6.943918426802622, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.4921875, "learning_rate": 0.000982047702117055, "loss": 0.0329, "macro_f1": 0.4901960790157318, "num_tokens": 1744155.0, "repeat_count": 0.0, "routers_loss": 0.05418570712208748, "skip_count": 3.0, "step": 1194, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 6.955571740713766, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.51953125, "learning_rate": 0.0009819445609090174, "loss": 0.0247, "macro_f1": 0.3333333432674408, "num_tokens": 1746852.0, "repeat_count": 0.0, "routers_loss": 0.00886525772511959, "skip_count": 0.0, "step": 1196, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 6.967225054624909, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.63671875, "learning_rate": 0.0009818411297084831, "loss": 0.034, "macro_f1": 0.32863849401474, "num_tokens": 1749385.0, "repeat_count": 0.0, "routers_loss": 0.01730005443096161, "skip_count": 0.0, "step": 1198, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 6.978878368536052, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.6875, "learning_rate": 0.0009817374085776887, "loss": 0.0369, "macro_f1": 0.3137255311012268, "num_tokens": 1752289.0, "repeat_count": 0.0, "routers_loss": 0.11223050206899643, "skip_count": 1.0, "step": 1200, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 6.990531682447196, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.27734375, "learning_rate": 0.000981633397579044, "loss": 0.0238, "macro_f1": 0.32863849401474, "num_tokens": 1756145.0, "repeat_count": 0.0, "routers_loss": 0.020947668701410294, "skip_count": 0.0, "step": 1202, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 7.0, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5078125, "learning_rate": 0.0009815290967751344, "loss": 0.0521, "macro_f1": 0.32380953431129456, "num_tokens": 1758512.0, "repeat_count": 0.0, "routers_loss": 0.03595234826207161, "skip_count": 1.0, "step": 1204, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 7.011653313911143, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.33203125, "learning_rate": 0.0009814245062287187, "loss": 0.0261, "macro_f1": 0.5507246255874634, "num_tokens": 1761200.0, "repeat_count": 0.0, "routers_loss": 0.020231328904628754, "skip_count": 2.0, "step": 1206, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 7.023306627822287, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.419921875, "learning_rate": 0.0009813196260027306, "loss": 0.0227, "macro_f1": 0.661835789680481, "num_tokens": 1764150.0, "repeat_count": 1.0, "routers_loss": 0.04087809845805168, "skip_count": 1.0, "step": 1208, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 33.0, "epoch": 7.0349599417334305, "f1_execute": 0.9696969985961914, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.3671875, "learning_rate": 0.000981214456160278, "loss": 0.0279, "macro_f1": 0.5454546213150024, "num_tokens": 1766836.0, "repeat_count": 0.0, "routers_loss": 0.041556306183338165, "skip_count": 3.0, "step": 1210, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.046613255644574, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.453125, "learning_rate": 0.0009811089967646427, "loss": 0.0186, "macro_f1": 0.32380953431129456, "num_tokens": 1770647.0, "repeat_count": 1.0, "routers_loss": 0.038625188171863556, "skip_count": 1.0, "step": 1212, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 7.058266569555718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.236328125, "learning_rate": 0.0009810032478792812, "loss": 0.0149, "macro_f1": 0.6666666865348816, "num_tokens": 1773445.0, "repeat_count": 0.0, "routers_loss": 0.013285388238728046, "skip_count": 3.0, "step": 1214, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.069919883466861, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.33203125, "learning_rate": 0.0009808972095678241, "loss": 0.0187, "macro_f1": 0.32863849401474, "num_tokens": 1775948.0, "repeat_count": 0.0, "routers_loss": 0.010309557430446148, "skip_count": 1.0, "step": 1216, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.081573197378004, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.30859375, "learning_rate": 0.000980790881894076, "loss": 0.0208, "macro_f1": 0.3333333432674408, "num_tokens": 1778487.0, "repeat_count": 0.0, "routers_loss": 0.004950762260705233, "skip_count": 0.0, "step": 1218, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 7.0932265112891475, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.45703125, "learning_rate": 0.0009806842649220159, "loss": 0.029, "macro_f1": 0.5507246255874634, "num_tokens": 1781670.0, "repeat_count": 0.0, "routers_loss": 0.025663692504167557, "skip_count": 2.0, "step": 1220, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 7.104879825200292, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.50390625, "learning_rate": 0.000980577358715796, "loss": 0.0216, "macro_f1": 0.6615384817123413, "num_tokens": 1784321.0, "repeat_count": 1.0, "routers_loss": 0.025694476440548897, "skip_count": 3.0, "step": 1222, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 7.116533139111435, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2412109375, "learning_rate": 0.0009804701633397439, "loss": 0.02, "macro_f1": 0.5507246255874634, "num_tokens": 1787228.0, "repeat_count": 0.0, "routers_loss": 0.022910989820957184, "skip_count": 2.0, "step": 1224, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 7.128186453022578, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.416015625, "learning_rate": 0.0009803626788583602, "loss": 0.0192, "macro_f1": 0.5950249433517456, "num_tokens": 1789922.0, "repeat_count": 0.0, "routers_loss": 0.037691593170166016, "skip_count": 3.0, "step": 1226, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 7.139839766933722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.419921875, "learning_rate": 0.0009802549053363199, "loss": 0.0219, "macro_f1": 0.6666666865348816, "num_tokens": 1792955.0, "repeat_count": 0.0, "routers_loss": 0.011994478292763233, "skip_count": 2.0, "step": 1228, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.151493080844865, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.62109375, "learning_rate": 0.0009801468428384716, "loss": 0.025, "macro_f1": 0.3333333432674408, "num_tokens": 1795781.0, "repeat_count": 0.0, "routers_loss": 0.007692502811551094, "skip_count": 0.0, "step": 1230, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.75, "avg_layers": 35.0, "epoch": 7.163146394756009, "f1_execute": 0.9491525292396545, "f1_repeat": 0.6666666865348816, "f1_skip": 0.8571428656578064, "grad_norm": 0.419921875, "learning_rate": 0.000980038491429838, "loss": 0.0189, "macro_f1": 0.8243207335472107, "num_tokens": 1799386.0, "repeat_count": 4.0, "routers_loss": 0.08312572538852692, "skip_count": 4.0, "step": 1232, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 7.174799708667152, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.416015625, "learning_rate": 0.0009799298511756157, "loss": 0.0209, "macro_f1": 0.6666666865348816, "num_tokens": 1802028.0, "repeat_count": 1.0, "routers_loss": 0.007664306554943323, "skip_count": 0.0, "step": 1234, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 7.186453022578296, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.27734375, "learning_rate": 0.0009798209221411748, "loss": 0.0192, "macro_f1": 0.4901960790157318, "num_tokens": 1804626.0, "repeat_count": 0.0, "routers_loss": 0.016113989055156708, "skip_count": 3.0, "step": 1236, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 7.198106336489439, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.29296875, "learning_rate": 0.0009797117043920593, "loss": 0.0166, "macro_f1": 1.0, "num_tokens": 1807301.0, "repeat_count": 1.0, "routers_loss": 0.0036687562242150307, "skip_count": 2.0, "step": 1238, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.209759650400582, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.0009796021979939873, "loss": 0.0203, "macro_f1": 0.3333333432674408, "num_tokens": 1809959.0, "repeat_count": 0.0, "routers_loss": 0.001701285713352263, "skip_count": 0.0, "step": 1240, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.221412964311726, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.0009794924030128503, "loss": 0.0203, "macro_f1": 0.3333333432674408, "num_tokens": 1813089.0, "repeat_count": 0.0, "routers_loss": 0.0017457931535318494, "skip_count": 0.0, "step": 1242, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.23306627822287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3984375, "learning_rate": 0.0009793823195147129, "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 1815908.0, "repeat_count": 0.0, "routers_loss": 0.0031612925231456757, "skip_count": 0.0, "step": 1244, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.244719592134013, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.396484375, "learning_rate": 0.0009792719475658143, "loss": 0.0174, "macro_f1": 0.32380953431129456, "num_tokens": 1818647.0, "repeat_count": 0.0, "routers_loss": 0.045140381902456284, "skip_count": 2.0, "step": 1246, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.256372906045157, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5703125, "learning_rate": 0.0009791612872325666, "loss": 0.0253, "macro_f1": 0.3333333432674408, "num_tokens": 1822534.0, "repeat_count": 0.0, "routers_loss": 0.0034986548125743866, "skip_count": 0.0, "step": 1248, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 7.2680262199563, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.609375, "learning_rate": 0.0009790503385815558, "loss": 0.0182, "macro_f1": 0.9280423521995544, "num_tokens": 1825039.0, "repeat_count": 2.0, "routers_loss": 0.02031663991510868, "skip_count": 2.0, "step": 1250, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 7.279679533867443, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.37109375, "learning_rate": 0.0009789391016795407, "loss": 0.0181, "macro_f1": 0.6666666865348816, "num_tokens": 1828100.0, "repeat_count": 0.0, "routers_loss": 0.0050806389190256596, "skip_count": 1.0, "step": 1252, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 7.2913328477785875, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.265625, "learning_rate": 0.0009788275765934544, "loss": 0.0244, "macro_f1": 0.6666666865348816, "num_tokens": 1831014.0, "repeat_count": 0.0, "routers_loss": 0.00407462427392602, "skip_count": 1.0, "step": 1254, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.302986161689731, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3828125, "learning_rate": 0.0009787157633904032, "loss": 0.0206, "macro_f1": 0.3333333432674408, "num_tokens": 1833805.0, "repeat_count": 0.0, "routers_loss": 0.009141999296844006, "skip_count": 0.0, "step": 1256, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 7.314639475600874, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.26171875, "learning_rate": 0.000978603662137666, "loss": 0.0223, "macro_f1": 1.0, "num_tokens": 1836981.0, "repeat_count": 1.0, "routers_loss": 0.00924980454146862, "skip_count": 2.0, "step": 1258, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.326292789512017, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.443359375, "learning_rate": 0.0009784912729026965, "loss": 0.0271, "macro_f1": 0.32863849401474, "num_tokens": 1840182.0, "repeat_count": 0.0, "routers_loss": 0.027851231396198273, "skip_count": 1.0, "step": 1260, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 7.337946103423161, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.390625, "learning_rate": 0.00097837859575312, "loss": 0.0199, "macro_f1": 0.6666666865348816, "num_tokens": 1843038.0, "repeat_count": 0.0, "routers_loss": 0.00745333731174469, "skip_count": 2.0, "step": 1262, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 34.0, "epoch": 7.349599417334304, "f1_execute": 0.9375, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.39453125, "learning_rate": 0.0009782656307567364, "loss": 0.0216, "macro_f1": 0.5347222685813904, "num_tokens": 1845882.0, "repeat_count": 2.0, "routers_loss": 0.16641445457935333, "skip_count": 4.0, "step": 1264, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 7.361252731245448, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.337890625, "learning_rate": 0.0009781523779815178, "loss": 0.0233, "macro_f1": 0.661835789680481, "num_tokens": 1849191.0, "repeat_count": 1.0, "routers_loss": 0.03648202121257782, "skip_count": 1.0, "step": 1266, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 7.372906045156592, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.33984375, "learning_rate": 0.0009780388374956103, "loss": 0.025, "macro_f1": 0.5950249433517456, "num_tokens": 1852770.0, "repeat_count": 0.0, "routers_loss": 0.021321328356862068, "skip_count": 3.0, "step": 1268, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 7.384559359067735, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.27734375, "learning_rate": 0.0009779250093673324, "loss": 0.0239, "macro_f1": 0.5507246255874634, "num_tokens": 1856867.0, "repeat_count": 0.0, "routers_loss": 0.039546482264995575, "skip_count": 2.0, "step": 1270, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.396212672978878, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.322265625, "learning_rate": 0.0009778108936651761, "loss": 0.0151, "macro_f1": 0.3333333432674408, "num_tokens": 1859456.0, "repeat_count": 0.0, "routers_loss": 0.0028854478150606155, "skip_count": 0.0, "step": 1272, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.407865986890021, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.458984375, "learning_rate": 0.0009776964904578064, "loss": 0.0212, "macro_f1": 0.3333333432674408, "num_tokens": 1862259.0, "repeat_count": 0.0, "routers_loss": 0.0018217323813587427, "skip_count": 0.0, "step": 1274, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 7.4195193008011655, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.609375, "learning_rate": 0.0009775817998140615, "loss": 0.0301, "macro_f1": 0.6666666865348816, "num_tokens": 1864940.0, "repeat_count": 0.0, "routers_loss": 0.009844623506069183, "skip_count": 1.0, "step": 1276, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 7.431172614712309, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.46875, "learning_rate": 0.000977466821802952, "loss": 0.022, "macro_f1": 0.661835789680481, "num_tokens": 1867556.0, "repeat_count": 1.0, "routers_loss": 0.09527061879634857, "skip_count": 1.0, "step": 1278, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 7.442825928623452, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.265625, "learning_rate": 0.0009773515564936617, "loss": 0.0206, "macro_f1": 0.6666666865348816, "num_tokens": 1870864.0, "repeat_count": 0.0, "routers_loss": 0.0036953177768737078, "skip_count": 1.0, "step": 1280, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 7.454479242534596, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.43359375, "learning_rate": 0.0009772360039555473, "loss": 0.0161, "macro_f1": 0.6666666865348816, "num_tokens": 1873611.0, "repeat_count": 0.0, "routers_loss": 0.009172399528324604, "skip_count": 1.0, "step": 1282, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.466132556445739, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5, "learning_rate": 0.0009771201642581385, "loss": 0.0369, "macro_f1": 0.3333333432674408, "num_tokens": 1876875.0, "repeat_count": 0.0, "routers_loss": 0.010612248443067074, "skip_count": 0.0, "step": 1284, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 37.0, "epoch": 7.4777858703568825, "f1_execute": 0.9846153855323792, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.330078125, "learning_rate": 0.0009770040374711374, "loss": 0.0242, "macro_f1": 0.928205132484436, "num_tokens": 1879360.0, "repeat_count": 3.0, "routers_loss": 0.02213934063911438, "skip_count": 1.0, "step": 1286, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.489439184268027, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.61328125, "learning_rate": 0.000976887623664419, "loss": 0.0185, "macro_f1": 0.32863849401474, "num_tokens": 1882126.0, "repeat_count": 0.0, "routers_loss": 0.016779758036136627, "skip_count": 1.0, "step": 1288, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.50109249817917, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.265625, "learning_rate": 0.000976770922908031, "loss": 0.021, "macro_f1": 0.3333333432674408, "num_tokens": 1885071.0, "repeat_count": 0.0, "routers_loss": 0.011452989652752876, "skip_count": 0.0, "step": 1290, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 7.512745812090313, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.26171875, "learning_rate": 0.0009766539352721942, "loss": 0.0199, "macro_f1": 0.5507246255874634, "num_tokens": 1888415.0, "repeat_count": 0.0, "routers_loss": 0.062072914093732834, "skip_count": 2.0, "step": 1292, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 35.0, "epoch": 7.524399126001457, "f1_execute": 0.9846153855323792, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.45703125, "learning_rate": 0.0009765366608273014, "loss": 0.0236, "macro_f1": 0.928205132484436, "num_tokens": 1891299.0, "repeat_count": 1.0, "routers_loss": 0.02751857414841652, "skip_count": 3.0, "step": 1294, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 7.5360524399126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.244140625, "learning_rate": 0.0009764190996439181, "loss": 0.0154, "macro_f1": 0.6666666865348816, "num_tokens": 1894008.0, "repeat_count": 0.0, "routers_loss": 0.0032395985908806324, "skip_count": 1.0, "step": 1296, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.5477057538237435, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.57421875, "learning_rate": 0.0009763012517927825, "loss": 0.0284, "macro_f1": 0.3333333432674408, "num_tokens": 1897091.0, "repeat_count": 0.0, "routers_loss": 0.004508689045906067, "skip_count": 0.0, "step": 1298, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 7.559359067734887, "f1_execute": 0.9696969985961914, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.42578125, "learning_rate": 0.0009761831173448052, "loss": 0.0232, "macro_f1": 0.5898990035057068, "num_tokens": 1900227.0, "repeat_count": 1.0, "routers_loss": 0.07351741939783096, "skip_count": 3.0, "step": 1300, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 7.571012381646031, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.62890625, "learning_rate": 0.0009760646963710693, "loss": 0.0195, "macro_f1": 0.9280423521995544, "num_tokens": 1902853.0, "repeat_count": 2.0, "routers_loss": 0.03688189014792442, "skip_count": 2.0, "step": 1302, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 7.582665695557174, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.267578125, "learning_rate": 0.0009759459889428304, "loss": 0.0142, "macro_f1": 0.32863849401474, "num_tokens": 1905516.0, "repeat_count": 0.0, "routers_loss": 0.01452261209487915, "skip_count": 0.0, "step": 1304, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 7.594319009468317, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.5625, "learning_rate": 0.0009758269951315163, "loss": 0.0207, "macro_f1": 0.5507246255874634, "num_tokens": 1908139.0, "repeat_count": 0.0, "routers_loss": 0.016085892915725708, "skip_count": 2.0, "step": 1306, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.605972323379461, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.84765625, "learning_rate": 0.0009757077150087271, "loss": 0.0236, "macro_f1": 0.32863849401474, "num_tokens": 1911501.0, "repeat_count": 1.0, "routers_loss": 0.010112724266946316, "skip_count": 0.0, "step": 1308, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.617625637290605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.380859375, "learning_rate": 0.0009755881486462353, "loss": 0.0226, "macro_f1": 0.3333333432674408, "num_tokens": 1914339.0, "repeat_count": 0.0, "routers_loss": 0.00307181547395885, "skip_count": 0.0, "step": 1310, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.75, "avg_layers": 34.0, "epoch": 7.629278951201748, "f1_execute": 0.9677419066429138, "f1_repeat": 0.6666666865348816, "f1_skip": 0.8571428656578064, "grad_norm": 0.6171875, "learning_rate": 0.0009754682961159854, "loss": 0.0346, "macro_f1": 0.8305171728134155, "num_tokens": 1917202.0, "repeat_count": 2.0, "routers_loss": 0.036693621426820755, "skip_count": 4.0, "step": 1312, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.640932265112891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3515625, "learning_rate": 0.0009753481574900947, "loss": 0.0198, "macro_f1": 0.3333333432674408, "num_tokens": 1919988.0, "repeat_count": 0.0, "routers_loss": 0.007889299653470516, "skip_count": 0.0, "step": 1314, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 7.652585579024035, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.458984375, "learning_rate": 0.0009752277328408516, "loss": 0.0196, "macro_f1": 0.6666666865348816, "num_tokens": 1922727.0, "repeat_count": 0.0, "routers_loss": 0.012916995212435722, "skip_count": 2.0, "step": 1316, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 7.664238892935178, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.64453125, "learning_rate": 0.0009751070222407179, "loss": 0.0192, "macro_f1": 0.32380953431129456, "num_tokens": 1925726.0, "repeat_count": 0.0, "routers_loss": 0.05555571988224983, "skip_count": 1.0, "step": 1318, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 7.675892206846322, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.447265625, "learning_rate": 0.0009749860257623263, "loss": 0.0266, "macro_f1": 0.6616915464401245, "num_tokens": 1928890.0, "repeat_count": 1.0, "routers_loss": 0.025641359388828278, "skip_count": 2.0, "step": 1320, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 35.0, "epoch": 7.687545520757466, "f1_execute": 0.9552239179611206, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 0.30859375, "learning_rate": 0.0009748647434784822, "loss": 0.021, "macro_f1": 0.4517413079738617, "num_tokens": 1931830.0, "repeat_count": 0.0, "routers_loss": 0.02731768786907196, "skip_count": 4.0, "step": 1322, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 7.699198834668609, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.4921875, "learning_rate": 0.0009747431754621627, "loss": 0.0301, "macro_f1": 1.0, "num_tokens": 1934452.0, "repeat_count": 1.0, "routers_loss": 0.0015580158215016127, "skip_count": 1.0, "step": 1324, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 7.710852148579752, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2177734375, "learning_rate": 0.000974621321786517, "loss": 0.0212, "macro_f1": 0.6666666865348816, "num_tokens": 1937386.0, "repeat_count": 0.0, "routers_loss": 0.0076023549772799015, "skip_count": 2.0, "step": 1326, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 7.722505462490896, "f1_execute": 0.9696969985961914, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, "grad_norm": 0.76171875, "learning_rate": 0.0009744991825248659, "loss": 0.0367, "macro_f1": 0.767676830291748, "num_tokens": 1940046.0, "repeat_count": 2.0, "routers_loss": 0.1392577439546585, "skip_count": 2.0, "step": 1328, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 7.734158776402039, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.427734375, "learning_rate": 0.0009743767577507026, "loss": 0.0222, "macro_f1": 0.32380953431129456, "num_tokens": 1942922.0, "repeat_count": 0.0, "routers_loss": 0.03869049251079559, "skip_count": 1.0, "step": 1330, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 7.745812090313183, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.52734375, "learning_rate": 0.0009742540475376913, "loss": 0.0206, "macro_f1": 0.5507246255874634, "num_tokens": 1945507.0, "repeat_count": 0.0, "routers_loss": 0.011766700074076653, "skip_count": 2.0, "step": 1332, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 7.757465404224327, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.71484375, "learning_rate": 0.0009741310519596686, "loss": 0.0281, "macro_f1": 0.32863849401474, "num_tokens": 1948303.0, "repeat_count": 0.0, "routers_loss": 0.019663140177726746, "skip_count": 0.0, "step": 1334, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 34.0, "epoch": 7.76911871813547, "f1_execute": 0.970588207244873, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.2734375, "learning_rate": 0.0009740077710906426, "loss": 0.0205, "macro_f1": 0.4901960790157318, "num_tokens": 1951203.0, "repeat_count": 0.0, "routers_loss": 0.03460928425192833, "skip_count": 2.0, "step": 1336, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.780772032046613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.431640625, "learning_rate": 0.0009738842050047929, "loss": 0.0252, "macro_f1": 0.3333333432674408, "num_tokens": 1954248.0, "repeat_count": 0.0, "routers_loss": 0.00594610208645463, "skip_count": 0.0, "step": 1338, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 7.792425345957756, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.9609375, "learning_rate": 0.000973760353776471, "loss": 0.0351, "macro_f1": 0.6616915464401245, "num_tokens": 1957673.0, "repeat_count": 1.0, "routers_loss": 0.01699882373213768, "skip_count": 2.0, "step": 1340, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 7.8040786598689005, "f1_execute": 0.9705882668495178, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.43359375, "learning_rate": 0.0009736362174801998, "loss": 0.0205, "macro_f1": 0.656862735748291, "num_tokens": 1960440.0, "repeat_count": 1.0, "routers_loss": 0.023291416466236115, "skip_count": 2.0, "step": 1342, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 7.815731973780044, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.28125, "learning_rate": 0.0009735117961906739, "loss": 0.0216, "macro_f1": 0.6666666865348816, "num_tokens": 1963946.0, "repeat_count": 0.0, "routers_loss": 0.008848550729453564, "skip_count": 1.0, "step": 1344, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 7.827385287691187, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.318359375, "learning_rate": 0.000973387089982759, "loss": 0.0261, "macro_f1": 0.6666666865348816, "num_tokens": 1967322.0, "repeat_count": 1.0, "routers_loss": 0.005059624556452036, "skip_count": 0.0, "step": 1346, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 7.839038601602331, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.5078125, "learning_rate": 0.0009732620989314927, "loss": 0.0362, "macro_f1": 0.4901960790157318, "num_tokens": 1969799.0, "repeat_count": 0.0, "routers_loss": 0.02909304015338421, "skip_count": 3.0, "step": 1348, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 7.850691915513474, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.494140625, "learning_rate": 0.0009731368231120836, "loss": 0.0214, "macro_f1": 0.6666666865348816, "num_tokens": 1973484.0, "repeat_count": 0.0, "routers_loss": 0.00785919837653637, "skip_count": 2.0, "step": 1350, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 7.862345229424617, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.265625, "learning_rate": 0.0009730112625999122, "loss": 0.0181, "macro_f1": 0.5507246255874634, "num_tokens": 1976409.0, "repeat_count": 0.0, "routers_loss": 0.06568765640258789, "skip_count": 2.0, "step": 1352, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 7.873998543335761, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.48828125, "learning_rate": 0.0009728854174705295, "loss": 0.0273, "macro_f1": 0.3333333432674408, "num_tokens": 1979026.0, "repeat_count": 1.0, "routers_loss": 0.01053231954574585, "skip_count": 0.0, "step": 1354, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.885651857246905, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3671875, "learning_rate": 0.0009727592877996585, "loss": 0.0166, "macro_f1": 0.3333333432674408, "num_tokens": 1981288.0, "repeat_count": 0.0, "routers_loss": 0.002565359463915229, "skip_count": 0.0, "step": 1356, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 36.0, "epoch": 7.897305171158048, "f1_execute": 0.9696969985961914, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 0.388671875, "learning_rate": 0.0009726328736631929, "loss": 0.0249, "macro_f1": 0.8232323527336121, "num_tokens": 1983621.0, "repeat_count": 1.0, "routers_loss": 0.039477020502090454, "skip_count": 3.0, "step": 1358, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 7.908958485069191, "f1_execute": 0.9696969985961914, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, "grad_norm": 0.369140625, "learning_rate": 0.000972506175137198, "loss": 0.0252, "macro_f1": 0.767676830291748, "num_tokens": 1986512.0, "repeat_count": 2.0, "routers_loss": 0.06426596641540527, "skip_count": 1.0, "step": 1360, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.427734375, "learning_rate": 0.0009723791922979098, "loss": 0.0227, "macro_f1": 0.3333333432674408, "num_tokens": 1989900.0, "repeat_count": 0.0, "routers_loss": 0.0022974186576902866, "skip_count": 0.0, "step": 1362, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.9322651128914785, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.337890625, "learning_rate": 0.0009722519252217357, "loss": 0.025, "macro_f1": 0.32863849401474, "num_tokens": 1992587.0, "repeat_count": 0.0, "routers_loss": 0.010398400947451591, "skip_count": 1.0, "step": 1364, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 7.943918426802622, "f1_execute": 0.9836065173149109, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.5234375, "learning_rate": 0.0009721243739852541, "loss": 0.0261, "macro_f1": 0.9278688430786133, "num_tokens": 1995121.0, "repeat_count": 3.0, "routers_loss": 0.01728745549917221, "skip_count": 3.0, "step": 1366, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.2857142984867096, "avg_layers": 35.0, "epoch": 7.955571740713766, "f1_execute": 0.9180328249931335, "f1_repeat": 1.0, "f1_skip": 0.444444477558136, "grad_norm": 0.4375, "learning_rate": 0.0009719965386652141, "loss": 0.021, "macro_f1": 0.7874924540519714, "num_tokens": 1997907.0, "repeat_count": 1.0, "routers_loss": 0.04845314845442772, "skip_count": 7.0, "step": 1368, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.967225054624909, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.39453125, "learning_rate": 0.0009718684193385359, "loss": 0.0229, "macro_f1": 0.3333333432674408, "num_tokens": 2001728.0, "repeat_count": 0.0, "routers_loss": 0.005543979350477457, "skip_count": 0.0, "step": 1370, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 7.978878368536052, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.29296875, "learning_rate": 0.0009717400160823106, "loss": 0.024, "macro_f1": 0.6666666865348816, "num_tokens": 2004547.0, "repeat_count": 1.0, "routers_loss": 0.008859404362738132, "skip_count": 0.0, "step": 1372, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 7.990531682447196, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.404296875, "learning_rate": 0.0009716113289738004, "loss": 0.0222, "macro_f1": 0.3333333432674408, "num_tokens": 2007544.0, "repeat_count": 0.0, "routers_loss": 0.009861774742603302, "skip_count": 0.0, "step": 1374, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.376953125, "learning_rate": 0.0009714823580904379, "loss": 0.0204, "macro_f1": 0.3333333432674408, "num_tokens": 2009728.0, "repeat_count": 0.0, "routers_loss": 0.012705311179161072, "skip_count": 0.0, "step": 1376, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.011653313911143, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.380859375, "learning_rate": 0.0009713531035098268, "loss": 0.0233, "macro_f1": 0.3333333432674408, "num_tokens": 2012115.0, "repeat_count": 0.0, "routers_loss": 0.003946088254451752, "skip_count": 0.0, "step": 1378, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 8.023306627822286, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.32421875, "learning_rate": 0.000971223565309741, "loss": 0.0269, "macro_f1": 0.5507246255874634, "num_tokens": 2015473.0, "repeat_count": 0.0, "routers_loss": 0.0316404290497303, "skip_count": 2.0, "step": 1380, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 8.03495994173343, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2890625, "learning_rate": 0.0009710937435681253, "loss": 0.0235, "macro_f1": 0.545751690864563, "num_tokens": 2018208.0, "repeat_count": 1.0, "routers_loss": 0.04204981401562691, "skip_count": 2.0, "step": 1382, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 8.046613255644575, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.38671875, "learning_rate": 0.0009709636383630955, "loss": 0.0191, "macro_f1": 0.5950249433517456, "num_tokens": 2021115.0, "repeat_count": 0.0, "routers_loss": 0.030135300010442734, "skip_count": 3.0, "step": 1384, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 8.058266569555718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.515625, "learning_rate": 0.0009708332497729377, "loss": 0.0242, "macro_f1": 0.6666666865348816, "num_tokens": 2023679.0, "repeat_count": 0.0, "routers_loss": 0.0057529425248503685, "skip_count": 2.0, "step": 1386, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.069919883466861, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.6328125, "learning_rate": 0.0009707025778761081, "loss": 0.0242, "macro_f1": 0.3333333432674408, "num_tokens": 2026432.0, "repeat_count": 0.0, "routers_loss": 0.0052957683801651, "skip_count": 0.0, "step": 1388, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.081573197378004, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.478515625, "learning_rate": 0.000970571622751234, "loss": 0.0137, "macro_f1": 0.3333333432674408, "num_tokens": 2028996.0, "repeat_count": 0.0, "routers_loss": 0.004670952912420034, "skip_count": 0.0, "step": 1390, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.093226511289147, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.4375, "learning_rate": 0.0009704403844771128, "loss": 0.0157, "macro_f1": 0.3333333432674408, "num_tokens": 2032263.0, "repeat_count": 0.0, "routers_loss": 0.005081538576632738, "skip_count": 0.0, "step": 1392, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.10487982520029, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.14453125, "learning_rate": 0.0009703088631327121, "loss": 0.014, "macro_f1": 0.3333333432674408, "num_tokens": 2034571.0, "repeat_count": 0.0, "routers_loss": 0.0020935838110744953, "skip_count": 0.0, "step": 1394, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 8.116533139111436, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.216796875, "learning_rate": 0.0009701770587971706, "loss": 0.0208, "macro_f1": 0.5507246255874634, "num_tokens": 2037593.0, "repeat_count": 0.0, "routers_loss": 0.013668816536664963, "skip_count": 2.0, "step": 1396, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 8.128186453022579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.333984375, "learning_rate": 0.0009700449715497961, "loss": 0.0191, "macro_f1": 0.6666666865348816, "num_tokens": 2040460.0, "repeat_count": 0.0, "routers_loss": 0.0015341010875999928, "skip_count": 1.0, "step": 1398, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.139839766933722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.220703125, "learning_rate": 0.0009699126014700676, "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 2043019.0, "repeat_count": 0.0, "routers_loss": 0.0013127224519848824, "skip_count": 0.0, "step": 1400, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 8.151493080844865, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.4296875, "learning_rate": 0.000969779948637634, "loss": 0.0169, "macro_f1": 0.6666666865348816, "num_tokens": 2045488.0, "repeat_count": 1.0, "routers_loss": 0.005461358930915594, "skip_count": 0.0, "step": 1402, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.163146394756009, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.38671875, "learning_rate": 0.000969647013132314, "loss": 0.0179, "macro_f1": 0.32380953431129456, "num_tokens": 2048719.0, "repeat_count": 0.0, "routers_loss": 0.05473482608795166, "skip_count": 2.0, "step": 1404, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.174799708667152, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.296875, "learning_rate": 0.0009695137950340971, "loss": 0.0139, "macro_f1": 0.3333333432674408, "num_tokens": 2051904.0, "repeat_count": 0.0, "routers_loss": 0.0048774308525025845, "skip_count": 0.0, "step": 1406, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.186453022578295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.33984375, "learning_rate": 0.0009693802944231421, "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 2054558.0, "repeat_count": 0.0, "routers_loss": 0.001617271569557488, "skip_count": 0.0, "step": 1408, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.19810633648944, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.451171875, "learning_rate": 0.0009692465113797779, "loss": 0.0184, "macro_f1": 0.3333333432674408, "num_tokens": 2056935.0, "repeat_count": 0.0, "routers_loss": 0.0027236805763095617, "skip_count": 0.0, "step": 1410, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 33.0, "epoch": 8.209759650400583, "f1_execute": 0.9375, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.62109375, "learning_rate": 0.0009691124459845042, "loss": 0.0219, "macro_f1": 0.5347222685813904, "num_tokens": 2060885.0, "repeat_count": 2.0, "routers_loss": 0.09331774711608887, "skip_count": 3.0, "step": 1412, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 8.221412964311726, "f1_execute": 0.9850746393203735, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.1669921875, "learning_rate": 0.0009689780983179894, "loss": 0.0172, "macro_f1": 0.8839138746261597, "num_tokens": 2063998.0, "repeat_count": 2.0, "routers_loss": 0.042079973965883255, "skip_count": 1.0, "step": 1414, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 8.23306627822287, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1669921875, "learning_rate": 0.0009688434684610725, "loss": 0.0187, "macro_f1": 0.5507246255874634, "num_tokens": 2066836.0, "repeat_count": 0.0, "routers_loss": 0.011139939539134502, "skip_count": 2.0, "step": 1416, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.244719592134013, "f1_execute": 0.970588207244873, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.74609375, "learning_rate": 0.0009687085564947619, "loss": 0.0252, "macro_f1": 0.656862735748291, "num_tokens": 2069542.0, "repeat_count": 1.0, "routers_loss": 0.07027164846658707, "skip_count": 1.0, "step": 1418, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 8.256372906045156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2412109375, "learning_rate": 0.0009685733625002363, "loss": 0.0183, "macro_f1": 0.6666666865348816, "num_tokens": 2072274.0, "repeat_count": 0.0, "routers_loss": 0.0012569413520395756, "skip_count": 1.0, "step": 1420, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.2680262199563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.283203125, "learning_rate": 0.0009684378865588434, "loss": 0.0138, "macro_f1": 0.3333333432674408, "num_tokens": 2074890.0, "repeat_count": 0.0, "routers_loss": 0.0025546562392264605, "skip_count": 0.0, "step": 1422, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 8.279679533867444, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1884765625, "learning_rate": 0.0009683021287521011, "loss": 0.0193, "macro_f1": 0.5507246255874634, "num_tokens": 2077577.0, "repeat_count": 0.0, "routers_loss": 0.05489431694149971, "skip_count": 2.0, "step": 1424, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.291332847778587, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2734375, "learning_rate": 0.0009681660891616966, "loss": 0.0159, "macro_f1": 0.3333333432674408, "num_tokens": 2080286.0, "repeat_count": 0.0, "routers_loss": 0.0044681113213300705, "skip_count": 0.0, "step": 1426, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.30298616168973, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2734375, "learning_rate": 0.0009680297678694867, "loss": 0.0188, "macro_f1": 0.32863849401474, "num_tokens": 2083279.0, "repeat_count": 1.0, "routers_loss": 0.049587540328502655, "skip_count": 0.0, "step": 1428, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.314639475600874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2119140625, "learning_rate": 0.0009678931649574978, "loss": 0.012, "macro_f1": 0.3333333432674408, "num_tokens": 2086299.0, "repeat_count": 0.0, "routers_loss": 0.0065501658245921135, "skip_count": 0.0, "step": 1430, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.326292789512017, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.6171875, "learning_rate": 0.0009677562805079257, "loss": 0.0174, "macro_f1": 0.32863849401474, "num_tokens": 2090101.0, "repeat_count": 0.0, "routers_loss": 0.05997999384999275, "skip_count": 1.0, "step": 1432, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.33794610342316, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.357421875, "learning_rate": 0.0009676191146031354, "loss": 0.0211, "macro_f1": 0.3333333432674408, "num_tokens": 2092762.0, "repeat_count": 0.0, "routers_loss": 0.0015313896583393216, "skip_count": 0.0, "step": 1434, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.349599417334304, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.29296875, "learning_rate": 0.0009674816673256619, "loss": 0.0112, "macro_f1": 0.3333333432674408, "num_tokens": 2095899.0, "repeat_count": 0.0, "routers_loss": 0.001808412023819983, "skip_count": 0.0, "step": 1436, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.361252731245449, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.361328125, "learning_rate": 0.0009673439387582088, "loss": 0.0241, "macro_f1": 0.32863849401474, "num_tokens": 2098429.0, "repeat_count": 0.0, "routers_loss": 0.06954631954431534, "skip_count": 1.0, "step": 1438, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.372906045156592, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.40625, "learning_rate": 0.0009672059289836492, "loss": 0.017, "macro_f1": 0.3333333432674408, "num_tokens": 2101290.0, "repeat_count": 0.0, "routers_loss": 0.009307604283094406, "skip_count": 0.0, "step": 1440, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 8.384559359067735, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.291015625, "learning_rate": 0.0009670676380850256, "loss": 0.016, "macro_f1": 0.6666666865348816, "num_tokens": 2104135.0, "repeat_count": 0.0, "routers_loss": 0.007539765909314156, "skip_count": 2.0, "step": 1442, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 8.396212672978878, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3828125, "learning_rate": 0.0009669290661455492, "loss": 0.0161, "macro_f1": 0.32863849401474, "num_tokens": 2107082.0, "repeat_count": 0.0, "routers_loss": 0.02375837229192257, "skip_count": 0.0, "step": 1444, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.407865986890021, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2890625, "learning_rate": 0.0009667902132486009, "loss": 0.0196, "macro_f1": 0.3333333432674408, "num_tokens": 2110165.0, "repeat_count": 0.0, "routers_loss": 0.003084545023739338, "skip_count": 0.0, "step": 1446, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.419519300801165, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.47265625, "learning_rate": 0.0009666510794777301, "loss": 0.0265, "macro_f1": 0.32863849401474, "num_tokens": 2113681.0, "repeat_count": 0.0, "routers_loss": 0.02758648805320263, "skip_count": 1.0, "step": 1448, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 8.43117261471231, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.38671875, "learning_rate": 0.0009665116649166557, "loss": 0.0141, "macro_f1": 0.6666666865348816, "num_tokens": 2116258.0, "repeat_count": 0.0, "routers_loss": 0.00156206835526973, "skip_count": 2.0, "step": 1450, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 8.442825928623453, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.34375, "learning_rate": 0.000966371969649265, "loss": 0.0206, "macro_f1": 0.661835789680481, "num_tokens": 2118887.0, "repeat_count": 1.0, "routers_loss": 0.021375581622123718, "skip_count": 1.0, "step": 1452, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.454479242534596, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.60546875, "learning_rate": 0.0009662319937596148, "loss": 0.0158, "macro_f1": 0.3333333432674408, "num_tokens": 2121298.0, "repeat_count": 0.0, "routers_loss": 0.0033357604406774044, "skip_count": 0.0, "step": 1454, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 8.46613255644574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.291015625, "learning_rate": 0.0009660917373319302, "loss": 0.0171, "macro_f1": 0.6666666865348816, "num_tokens": 2125037.0, "repeat_count": 0.0, "routers_loss": 0.007960156537592411, "skip_count": 2.0, "step": 1456, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.7777777910232544, "avg_layers": 29.0, "epoch": 8.477785870356882, "f1_execute": 0.964285671710968, "f1_repeat": 0.0, "f1_skip": 0.875, "grad_norm": 0.41015625, "learning_rate": 0.0009659512004506057, "loss": 0.0187, "macro_f1": 0.613095223903656, "num_tokens": 2127716.0, "repeat_count": 0.0, "routers_loss": 0.017714491114020348, "skip_count": 9.0, "step": 1458, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.489439184268026, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.33203125, "learning_rate": 0.000965810383200204, "loss": 0.02, "macro_f1": 0.3333333432674408, "num_tokens": 2130289.0, "repeat_count": 0.0, "routers_loss": 0.00937635451555252, "skip_count": 0.0, "step": 1460, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.501092498179169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.30859375, "learning_rate": 0.0009656692856654567, "loss": 0.0158, "macro_f1": 0.3333333432674408, "num_tokens": 2133395.0, "repeat_count": 0.0, "routers_loss": 0.007414102554321289, "skip_count": 0.0, "step": 1462, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 8.512745812090314, "f1_execute": 0.96875, "f1_repeat": 0.6666666865348816, "f1_skip": 0.800000011920929, "grad_norm": 0.55078125, "learning_rate": 0.0009655279079312642, "loss": 0.0201, "macro_f1": 0.8118056058883667, "num_tokens": 2135789.0, "repeat_count": 2.0, "routers_loss": 0.03921913355588913, "skip_count": 2.0, "step": 1464, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.75, "avg_layers": 34.0, "epoch": 8.524399126001457, "f1_execute": 0.9677419066429138, "f1_repeat": 0.6666666865348816, "f1_skip": 0.8571428656578064, "grad_norm": 0.24609375, "learning_rate": 0.0009653862500826953, "loss": 0.0151, "macro_f1": 0.8305171728134155, "num_tokens": 2140086.0, "repeat_count": 2.0, "routers_loss": 0.04724593833088875, "skip_count": 4.0, "step": 1466, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 8.5360524399126, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.2392578125, "learning_rate": 0.0009652443122049874, "loss": 0.0197, "macro_f1": 0.6666666865348816, "num_tokens": 2142784.0, "repeat_count": 1.0, "routers_loss": 0.004312635399401188, "skip_count": 0.0, "step": 1468, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.547705753823744, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.44921875, "learning_rate": 0.0009651020943835465, "loss": 0.02, "macro_f1": 0.32863849401474, "num_tokens": 2145947.0, "repeat_count": 0.0, "routers_loss": 0.03540536388754845, "skip_count": 1.0, "step": 1470, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 8.559359067734887, "f1_execute": 0.9552239179611206, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.34765625, "learning_rate": 0.0009649595967039469, "loss": 0.0186, "macro_f1": 0.5406302213668823, "num_tokens": 2148282.0, "repeat_count": 2.0, "routers_loss": 0.08795808255672455, "skip_count": 2.0, "step": 1472, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.8333333134651184, "avg_layers": 33.0, "epoch": 8.57101238164603, "f1_execute": 0.9824560880661011, "f1_repeat": 1.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.2578125, "learning_rate": 0.0009648168192519316, "loss": 0.0229, "macro_f1": 0.9638490676879883, "num_tokens": 2151855.0, "repeat_count": 2.0, "routers_loss": 0.012084978632628918, "skip_count": 6.0, "step": 1474, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.582665695557175, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.44921875, "learning_rate": 0.0009646737621134112, "loss": 0.0219, "macro_f1": 0.3333333432674408, "num_tokens": 2154730.0, "repeat_count": 0.0, "routers_loss": 0.0026112389750778675, "skip_count": 0.0, "step": 1476, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.594319009468318, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.546875, "learning_rate": 0.0009645304253744656, "loss": 0.0187, "macro_f1": 0.32863849401474, "num_tokens": 2157410.0, "repeat_count": 0.0, "routers_loss": 0.009926875121891499, "skip_count": 1.0, "step": 1478, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 8.605972323379461, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.35546875, "learning_rate": 0.0009643868091213421, "loss": 0.0246, "macro_f1": 0.5507246255874634, "num_tokens": 2160301.0, "repeat_count": 0.0, "routers_loss": 0.023245198652148247, "skip_count": 2.0, "step": 1480, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 8.617625637290605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.19140625, "learning_rate": 0.0009642429134404568, "loss": 0.0128, "macro_f1": 0.6666666865348816, "num_tokens": 2164398.0, "repeat_count": 0.0, "routers_loss": 0.005960884038358927, "skip_count": 2.0, "step": 1482, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.629278951201748, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.388671875, "learning_rate": 0.0009640987384183934, "loss": 0.0179, "macro_f1": 0.32863849401474, "num_tokens": 2167404.0, "repeat_count": 1.0, "routers_loss": 0.027414554730057716, "skip_count": 0.0, "step": 1484, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.640932265112891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5078125, "learning_rate": 0.0009639542841419041, "loss": 0.0167, "macro_f1": 0.3333333432674408, "num_tokens": 2170088.0, "repeat_count": 0.0, "routers_loss": 0.0036883570719510317, "skip_count": 0.0, "step": 1486, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 8.652585579024034, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.4140625, "learning_rate": 0.000963809550697909, "loss": 0.0185, "macro_f1": 0.5507246255874634, "num_tokens": 2172831.0, "repeat_count": 0.0, "routers_loss": 0.010255473665893078, "skip_count": 2.0, "step": 1488, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 8.66423889293518, "f1_execute": 0.9846153855323792, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.359375, "learning_rate": 0.0009636645381734959, "loss": 0.0218, "macro_f1": 0.8837606906890869, "num_tokens": 2175353.0, "repeat_count": 2.0, "routers_loss": 0.09622028470039368, "skip_count": 2.0, "step": 1490, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 8.675892206846322, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1240234375, "learning_rate": 0.0009635192466559211, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 2179012.0, "repeat_count": 0.0, "routers_loss": 0.0077851139940321445, "skip_count": 2.0, "step": 1492, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 8.687545520757466, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.421875, "learning_rate": 0.0009633736762326083, "loss": 0.0167, "macro_f1": 0.6666666865348816, "num_tokens": 2181860.0, "repeat_count": 1.0, "routers_loss": 0.0020079459063708782, "skip_count": 0.0, "step": 1494, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.699198834668609, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.169921875, "learning_rate": 0.0009632278269911492, "loss": 0.0158, "macro_f1": 0.3333333432674408, "num_tokens": 2185631.0, "repeat_count": 0.0, "routers_loss": 0.004729453008621931, "skip_count": 0.0, "step": 1496, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 8.710852148579752, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.474609375, "learning_rate": 0.0009630816990193032, "loss": 0.0166, "macro_f1": 0.3333333432674408, "num_tokens": 2188286.0, "repeat_count": 0.0, "routers_loss": 0.033531442284584045, "skip_count": 1.0, "step": 1498, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.722505462490895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2099609375, "learning_rate": 0.0009629352924049974, "loss": 0.0195, "macro_f1": 0.3333333432674408, "num_tokens": 2190692.0, "repeat_count": 0.0, "routers_loss": 0.005399714224040508, "skip_count": 0.0, "step": 1500, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 8.734158776402039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.23828125, "learning_rate": 0.0009627886072363268, "loss": 0.0139, "macro_f1": 0.6666666865348816, "num_tokens": 2194181.0, "repeat_count": 0.0, "routers_loss": 0.01003585197031498, "skip_count": 2.0, "step": 1502, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.745812090313184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5234375, "learning_rate": 0.0009626416436015538, "loss": 0.0212, "macro_f1": 0.3333333432674408, "num_tokens": 2196843.0, "repeat_count": 0.0, "routers_loss": 0.005224708467721939, "skip_count": 0.0, "step": 1504, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 8.757465404224327, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.345703125, "learning_rate": 0.000962494401589108, "loss": 0.0261, "macro_f1": 1.0, "num_tokens": 2199793.0, "repeat_count": 1.0, "routers_loss": 0.0016851952532306314, "skip_count": 1.0, "step": 1506, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.76911871813547, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1083984375, "learning_rate": 0.0009623468812875875, "loss": 0.0108, "macro_f1": 0.32863849401474, "num_tokens": 2202707.0, "repeat_count": 0.0, "routers_loss": 0.011953038163483143, "skip_count": 1.0, "step": 1508, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 8.780772032046613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.33203125, "learning_rate": 0.0009621990827857568, "loss": 0.0189, "macro_f1": 0.6666666865348816, "num_tokens": 2206032.0, "repeat_count": 0.0, "routers_loss": 0.0072102416306734085, "skip_count": 2.0, "step": 1510, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.792425345957756, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2236328125, "learning_rate": 0.0009620510061725485, "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 2209310.0, "repeat_count": 0.0, "routers_loss": 0.0011601937003433704, "skip_count": 0.0, "step": 1512, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 8.8040786598689, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.32421875, "learning_rate": 0.0009619026515370621, "loss": 0.0181, "macro_f1": 0.5507246255874634, "num_tokens": 2212000.0, "repeat_count": 0.0, "routers_loss": 0.01588837057352066, "skip_count": 2.0, "step": 1514, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 36.0, "epoch": 8.815731973780043, "f1_execute": 0.9393939971923828, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.37890625, "learning_rate": 0.0009617540189685648, "loss": 0.0189, "macro_f1": 0.47979801893234253, "num_tokens": 2214514.0, "repeat_count": 1.0, "routers_loss": 0.08212276548147202, "skip_count": 3.0, "step": 1516, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 8.827385287691188, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.33203125, "learning_rate": 0.0009616051085564905, "loss": 0.019, "macro_f1": 0.32863849401474, "num_tokens": 2217321.0, "repeat_count": 0.0, "routers_loss": 0.02148258499801159, "skip_count": 0.0, "step": 1518, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.839038601602331, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1962890625, "learning_rate": 0.0009614559203904407, "loss": 0.0154, "macro_f1": 0.3333333432674408, "num_tokens": 2220257.0, "repeat_count": 0.0, "routers_loss": 0.0017791842110455036, "skip_count": 0.0, "step": 1520, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.850691915513474, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2265625, "learning_rate": 0.0009613064545601841, "loss": 0.0165, "macro_f1": 0.3333333432674408, "num_tokens": 2222826.0, "repeat_count": 0.0, "routers_loss": 0.0046244338154792786, "skip_count": 0.0, "step": 1522, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.862345229424617, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3203125, "learning_rate": 0.0009611567111556561, "loss": 0.0192, "macro_f1": 0.3333333432674408, "num_tokens": 2225503.0, "repeat_count": 0.0, "routers_loss": 0.0021657489705830812, "skip_count": 0.0, "step": 1524, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.87399854333576, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.173828125, "learning_rate": 0.0009610066902669592, "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 2228611.0, "repeat_count": 0.0, "routers_loss": 0.00322137214243412, "skip_count": 0.0, "step": 1526, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 33.0, "epoch": 8.885651857246904, "f1_execute": 0.9523809552192688, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.34765625, "learning_rate": 0.0009608563919843633, "loss": 0.0193, "macro_f1": 0.5396825671195984, "num_tokens": 2231454.0, "repeat_count": 0.0, "routers_loss": 0.05385033041238785, "skip_count": 6.0, "step": 1528, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 8.897305171158049, "f1_execute": 0.9850746393203735, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.5625, "learning_rate": 0.0009607058163983047, "loss": 0.0227, "macro_f1": 0.8839138746261597, "num_tokens": 2234257.0, "repeat_count": 2.0, "routers_loss": 0.1544652283191681, "skip_count": 1.0, "step": 1530, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.908958485069192, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.189453125, "learning_rate": 0.0009605549635993867, "loss": 0.0182, "macro_f1": 0.3333333432674408, "num_tokens": 2237307.0, "repeat_count": 0.0, "routers_loss": 0.0010812574764713645, "skip_count": 0.0, "step": 1532, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 8.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.29296875, "learning_rate": 0.0009604038336783796, "loss": 0.0201, "macro_f1": 0.6666666865348816, "num_tokens": 2240486.0, "repeat_count": 0.0, "routers_loss": 0.010306981392204762, "skip_count": 1.0, "step": 1534, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.4000000059604645, "avg_layers": 35.0, "epoch": 8.932265112891479, "f1_execute": 0.9523809552192688, "f1_repeat": 1.0, "f1_skip": 0.5714285969734192, "grad_norm": 0.38671875, "learning_rate": 0.0009602524267262203, "loss": 0.0183, "macro_f1": 0.841269850730896, "num_tokens": 2244456.0, "repeat_count": 1.0, "routers_loss": 0.06706848740577698, "skip_count": 5.0, "step": 1536, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 8.943918426802622, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.318359375, "learning_rate": 0.000960100742834012, "loss": 0.0174, "macro_f1": 0.5507246255874634, "num_tokens": 2247223.0, "repeat_count": 0.0, "routers_loss": 0.013069516979157925, "skip_count": 1.0, "step": 1538, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 8.955571740713765, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.140625, "learning_rate": 0.0009599487820930255, "loss": 0.0154, "macro_f1": 0.6666666865348816, "num_tokens": 2249950.0, "repeat_count": 0.0, "routers_loss": 0.0037508602254092693, "skip_count": 1.0, "step": 1540, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 8.967225054624908, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2021484375, "learning_rate": 0.0009597965445946972, "loss": 0.0133, "macro_f1": 0.6616915464401245, "num_tokens": 2253189.0, "repeat_count": 1.0, "routers_loss": 0.027775101363658905, "skip_count": 2.0, "step": 1542, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 8.978878368536053, "f1_execute": 0.9677419066429138, "f1_repeat": 0.800000011920929, "f1_skip": 0.800000011920929, "grad_norm": 0.69140625, "learning_rate": 0.0009596440304306308, "loss": 0.0271, "macro_f1": 0.8559139966964722, "num_tokens": 2255836.0, "repeat_count": 3.0, "routers_loss": 0.08376672863960266, "skip_count": 2.0, "step": 1544, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 8.990531682447196, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2275390625, "learning_rate": 0.0009594912396925958, "loss": 0.0159, "macro_f1": 0.3333333432674408, "num_tokens": 2258753.0, "repeat_count": 0.0, "routers_loss": 0.006385433487594128, "skip_count": 0.0, "step": 1546, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 9.0, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.154296875, "learning_rate": 0.0009593381724725286, "loss": 0.0124, "macro_f1": 0.6666666865348816, "num_tokens": 2260944.0, "repeat_count": 2.0, "routers_loss": 0.006614699959754944, "skip_count": 0.0, "step": 1548, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 9.011653313911143, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.45703125, "learning_rate": 0.0009591848288625315, "loss": 0.0164, "macro_f1": 0.6666666865348816, "num_tokens": 2263210.0, "repeat_count": 1.0, "routers_loss": 0.005375010427087545, "skip_count": 0.0, "step": 1550, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 9.023306627822286, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.328125, "learning_rate": 0.0009590312089548739, "loss": 0.0175, "macro_f1": 1.0, "num_tokens": 2266715.0, "repeat_count": 1.0, "routers_loss": 0.009920787066221237, "skip_count": 2.0, "step": 1552, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 9.03495994173343, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.17578125, "learning_rate": 0.0009588773128419905, "loss": 0.0125, "macro_f1": 1.0, "num_tokens": 2269324.0, "repeat_count": 1.0, "routers_loss": 0.008531320840120316, "skip_count": 2.0, "step": 1554, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.046613255644575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.31640625, "learning_rate": 0.0009587231406164831, "loss": 0.0104, "macro_f1": 0.3333333432674408, "num_tokens": 2272516.0, "repeat_count": 0.0, "routers_loss": 0.0037369178608059883, "skip_count": 0.0, "step": 1556, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.058266569555718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.30078125, "learning_rate": 0.0009585686923711188, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 2275143.0, "repeat_count": 0.0, "routers_loss": 0.0021213325671851635, "skip_count": 0.0, "step": 1558, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 9.069919883466861, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.23046875, "learning_rate": 0.0009584139681988313, "loss": 0.0151, "macro_f1": 0.5507246255874634, "num_tokens": 2278882.0, "repeat_count": 0.0, "routers_loss": 0.01237131655216217, "skip_count": 2.0, "step": 1560, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 9.081573197378004, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.3046875, "learning_rate": 0.0009582589681927201, "loss": 0.0158, "macro_f1": 0.661835789680481, "num_tokens": 2281975.0, "repeat_count": 1.0, "routers_loss": 0.01142028346657753, "skip_count": 1.0, "step": 1562, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.093226511289147, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009581036924460511, "loss": 0.0142, "macro_f1": 0.3333333432674408, "num_tokens": 2284631.0, "repeat_count": 0.0, "routers_loss": 0.0014380351640284061, "skip_count": 0.0, "step": 1564, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 9.10487982520029, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.21875, "learning_rate": 0.0009579481410522556, "loss": 0.0168, "macro_f1": 0.6666666865348816, "num_tokens": 2287318.0, "repeat_count": 0.0, "routers_loss": 0.010679463855922222, "skip_count": 1.0, "step": 1566, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 9.116533139111436, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.25390625, "learning_rate": 0.0009577923141049308, "loss": 0.0107, "macro_f1": 1.0, "num_tokens": 2290180.0, "repeat_count": 3.0, "routers_loss": 0.005918140057474375, "skip_count": 6.0, "step": 1568, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.128186453022579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009576362116978401, "loss": 0.0145, "macro_f1": 0.3333333432674408, "num_tokens": 2292953.0, "repeat_count": 0.0, "routers_loss": 0.002247856231406331, "skip_count": 0.0, "step": 1570, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 9.139839766933722, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.2578125, "learning_rate": 0.0009574798339249124, "loss": 0.0141, "macro_f1": 1.0, "num_tokens": 2295722.0, "repeat_count": 2.0, "routers_loss": 0.013855984434485435, "skip_count": 2.0, "step": 1572, "text_loss": 0.0 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 9.151493080844865, "f1_execute": 0.9354838728904724, "f1_repeat": 0.5, "f1_skip": 0.6666666865348816, "grad_norm": 0.255859375, "learning_rate": 0.0009573231808802421, "loss": 0.0153, "macro_f1": 0.7007169127464294, "num_tokens": 2298702.0, "repeat_count": 3.0, "routers_loss": 0.04777120426297188, "skip_count": 4.0, "step": 1574, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.163146394756009, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.36328125, "learning_rate": 0.0009571662526580897, "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 2301617.0, "repeat_count": 0.0, "routers_loss": 0.0017366715474054217, "skip_count": 0.0, "step": 1576, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 9.174799708667152, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.30078125, "learning_rate": 0.0009570090493528809, "loss": 0.0142, "macro_f1": 0.5507246255874634, "num_tokens": 2304110.0, "repeat_count": 0.0, "routers_loss": 0.021260440349578857, "skip_count": 2.0, "step": 1578, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.186453022578295, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2734375, "learning_rate": 0.0009568515710592069, "loss": 0.0129, "macro_f1": 0.32863849401474, "num_tokens": 2307780.0, "repeat_count": 0.0, "routers_loss": 0.01021614670753479, "skip_count": 1.0, "step": 1580, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.19810633648944, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2119140625, "learning_rate": 0.0009566938178718245, "loss": 0.0107, "macro_f1": 0.32863849401474, "num_tokens": 2310687.0, "repeat_count": 1.0, "routers_loss": 0.017660068348050117, "skip_count": 0.0, "step": 1582, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.209759650400583, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.21875, "learning_rate": 0.0009565357898856561, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 2313968.0, "repeat_count": 0.0, "routers_loss": 0.0027978660073131323, "skip_count": 0.0, "step": 1584, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 9.221412964311726, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.287109375, "learning_rate": 0.0009563774871957894, "loss": 0.0132, "macro_f1": 0.545751690864563, "num_tokens": 2316685.0, "repeat_count": 1.0, "routers_loss": 0.02142636477947235, "skip_count": 2.0, "step": 1586, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 9.23306627822287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.259765625, "learning_rate": 0.0009562189098974767, "loss": 0.0147, "macro_f1": 0.6666666865348816, "num_tokens": 2319698.0, "repeat_count": 0.0, "routers_loss": 0.006325713358819485, "skip_count": 2.0, "step": 1588, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 9.244719592134013, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.251953125, "learning_rate": 0.0009560600580861365, "loss": 0.0136, "macro_f1": 0.6666666865348816, "num_tokens": 2322311.0, "repeat_count": 1.0, "routers_loss": 0.0031695247162133455, "skip_count": 0.0, "step": 1590, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 9.256372906045156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.44140625, "learning_rate": 0.0009559009318573519, "loss": 0.0165, "macro_f1": 0.6666666865348816, "num_tokens": 2324869.0, "repeat_count": 0.0, "routers_loss": 0.007290478330105543, "skip_count": 2.0, "step": 1592, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 39.0, "epoch": 9.2680262199563, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.310546875, "learning_rate": 0.0009557415313068716, "loss": 0.0133, "macro_f1": 1.0, "num_tokens": 2327547.0, "repeat_count": 4.0, "routers_loss": 0.008649119175970554, "skip_count": 1.0, "step": 1594, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 33.0, "epoch": 9.279679533867444, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.3203125, "learning_rate": 0.0009555818565306084, "loss": 0.0153, "macro_f1": 0.6139194369316101, "num_tokens": 2331203.0, "repeat_count": 0.0, "routers_loss": 0.014886928722262383, "skip_count": 4.0, "step": 1596, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 36.0, "epoch": 9.291332847778587, "f1_execute": 0.9696969985961914, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 0.498046875, "learning_rate": 0.0009554219076246415, "loss": 0.0125, "macro_f1": 0.8232323527336121, "num_tokens": 2333495.0, "repeat_count": 1.0, "routers_loss": 0.015020898543298244, "skip_count": 3.0, "step": 1598, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 9.30298616168973, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2294921875, "learning_rate": 0.0009552616846852138, "loss": 0.0167, "macro_f1": 0.5507246255874634, "num_tokens": 2336771.0, "repeat_count": 0.0, "routers_loss": 0.012441723607480526, "skip_count": 1.0, "step": 1600, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 34.0, "epoch": 9.314639475600874, "f1_execute": 0.9696969985961914, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.330078125, "learning_rate": 0.0009551011878087337, "loss": 0.0113, "macro_f1": 0.5454546213150024, "num_tokens": 2339502.0, "repeat_count": 0.0, "routers_loss": 0.032267991453409195, "skip_count": 4.0, "step": 1602, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.326292789512017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.328125, "learning_rate": 0.0009549404170917744, "loss": 0.0161, "macro_f1": 0.3333333432674408, "num_tokens": 2342494.0, "repeat_count": 0.0, "routers_loss": 0.003980159293860197, "skip_count": 0.0, "step": 1604, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 9.33794610342316, "f1_execute": 0.9687499403953552, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.5234375, "learning_rate": 0.0009547793726310737, "loss": 0.0194, "macro_f1": 0.8784722685813904, "num_tokens": 2345618.0, "repeat_count": 1.0, "routers_loss": 0.015991652384400368, "skip_count": 2.0, "step": 1606, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 9.349599417334304, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.306640625, "learning_rate": 0.0009546180545235343, "loss": 0.0138, "macro_f1": 0.5950249433517456, "num_tokens": 2348650.0, "repeat_count": 0.0, "routers_loss": 0.025174306705594063, "skip_count": 3.0, "step": 1608, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 9.361252731245449, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.375, "learning_rate": 0.0009544564628662233, "loss": 0.014, "macro_f1": 0.6666666865348816, "num_tokens": 2351559.0, "repeat_count": 0.0, "routers_loss": 0.007719031535089016, "skip_count": 2.0, "step": 1610, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.372906045156592, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.21484375, "learning_rate": 0.0009542945977563729, "loss": 0.015, "macro_f1": 0.3333333432674408, "num_tokens": 2355318.0, "repeat_count": 0.0, "routers_loss": 0.009787659160792828, "skip_count": 0.0, "step": 1612, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 33.0, "epoch": 9.384559359067735, "f1_execute": 0.9696969985961914, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.55078125, "learning_rate": 0.0009541324592913791, "loss": 0.0133, "macro_f1": 0.5454546213150024, "num_tokens": 2358519.0, "repeat_count": 0.0, "routers_loss": 0.02144056186079979, "skip_count": 3.0, "step": 1614, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 9.396212672978878, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.2236328125, "learning_rate": 0.000953970047568803, "loss": 0.0172, "macro_f1": 0.5950249433517456, "num_tokens": 2360996.0, "repeat_count": 0.0, "routers_loss": 0.01228449959307909, "skip_count": 3.0, "step": 1616, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.407865986890021, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3671875, "learning_rate": 0.00095380736268637, "loss": 0.0152, "macro_f1": 0.3333333432674408, "num_tokens": 2364950.0, "repeat_count": 0.0, "routers_loss": 0.0016463599167764187, "skip_count": 0.0, "step": 1618, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.419519300801165, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.220703125, "learning_rate": 0.0009536444047419695, "loss": 0.0151, "macro_f1": 0.32380953431129456, "num_tokens": 2368041.0, "repeat_count": 1.0, "routers_loss": 0.07967822253704071, "skip_count": 1.0, "step": 1620, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 9.43117261471231, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.68359375, "learning_rate": 0.0009534811738336557, "loss": 0.0125, "macro_f1": 0.32863849401474, "num_tokens": 2370564.0, "repeat_count": 0.0, "routers_loss": 0.016032027080655098, "skip_count": 0.0, "step": 1622, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 9.442825928623453, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.39453125, "learning_rate": 0.0009533176700596466, "loss": 0.0159, "macro_f1": 0.5507246255874634, "num_tokens": 2373277.0, "repeat_count": 0.0, "routers_loss": 0.026493791490793228, "skip_count": 2.0, "step": 1624, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 9.454479242534596, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.36328125, "learning_rate": 0.0009531538935183251, "loss": 0.0225, "macro_f1": 1.0, "num_tokens": 2376131.0, "repeat_count": 1.0, "routers_loss": 0.002049001632258296, "skip_count": 1.0, "step": 1626, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.46613255644574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3671875, "learning_rate": 0.000952989844308237, "loss": 0.0147, "macro_f1": 0.3333333432674408, "num_tokens": 2379367.0, "repeat_count": 0.0, "routers_loss": 0.0017097246600314975, "skip_count": 0.0, "step": 1628, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.477785870356882, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.640625, "learning_rate": 0.0009528255225280936, "loss": 0.0312, "macro_f1": 0.32380953431129456, "num_tokens": 2381774.0, "repeat_count": 0.0, "routers_loss": 0.02108234353363514, "skip_count": 2.0, "step": 1630, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.489439184268026, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.439453125, "learning_rate": 0.0009526609282767691, "loss": 0.0195, "macro_f1": 0.3333333432674408, "num_tokens": 2384783.0, "repeat_count": 0.0, "routers_loss": 0.004978402983397245, "skip_count": 0.0, "step": 1632, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.501092498179169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.302734375, "learning_rate": 0.0009524960616533023, "loss": 0.018, "macro_f1": 0.3333333432674408, "num_tokens": 2387798.0, "repeat_count": 0.0, "routers_loss": 0.0028558403719216585, "skip_count": 0.0, "step": 1634, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.512745812090314, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.455078125, "learning_rate": 0.0009523309227568955, "loss": 0.0155, "macro_f1": 0.3333333432674408, "num_tokens": 2390788.0, "repeat_count": 0.0, "routers_loss": 0.008327784948050976, "skip_count": 0.0, "step": 1636, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 9.524399126001457, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.423828125, "learning_rate": 0.000952165511686915, "loss": 0.0154, "macro_f1": 0.32380953431129456, "num_tokens": 2394060.0, "repeat_count": 0.0, "routers_loss": 0.03066965378820896, "skip_count": 1.0, "step": 1638, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.5360524399126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1474609375, "learning_rate": 0.0009519998285428908, "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 2397032.0, "repeat_count": 0.0, "routers_loss": 0.003996845334768295, "skip_count": 0.0, "step": 1640, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 9.547705753823744, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.55078125, "learning_rate": 0.0009518338734245169, "loss": 0.012, "macro_f1": 0.661835789680481, "num_tokens": 2400014.0, "repeat_count": 1.0, "routers_loss": 0.0055440873838961124, "skip_count": 1.0, "step": 1642, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.559359067734887, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.390625, "learning_rate": 0.0009516676464316505, "loss": 0.017, "macro_f1": 0.32863849401474, "num_tokens": 2402913.0, "repeat_count": 0.0, "routers_loss": 0.015986017882823944, "skip_count": 1.0, "step": 1644, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.57101238164603, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2275390625, "learning_rate": 0.0009515011476643126, "loss": 0.0172, "macro_f1": 0.3333333432674408, "num_tokens": 2405545.0, "repeat_count": 0.0, "routers_loss": 0.004096328280866146, "skip_count": 0.0, "step": 1646, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 9.582665695557175, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.142578125, "learning_rate": 0.000951334377222688, "loss": 0.009, "macro_f1": 1.0, "num_tokens": 2408265.0, "repeat_count": 2.0, "routers_loss": 0.014345947653055191, "skip_count": 4.0, "step": 1648, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 9.594319009468318, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.236328125, "learning_rate": 0.0009511673352071244, "loss": 0.0136, "macro_f1": 0.6666666865348816, "num_tokens": 2410932.0, "repeat_count": 1.0, "routers_loss": 0.007184064947068691, "skip_count": 0.0, "step": 1650, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.605972323379461, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.4375, "learning_rate": 0.0009510000217181333, "loss": 0.0141, "macro_f1": 0.32380953431129456, "num_tokens": 2413432.0, "repeat_count": 0.0, "routers_loss": 0.037248507142066956, "skip_count": 2.0, "step": 1652, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.617625637290605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.22265625, "learning_rate": 0.0009508324368563895, "loss": 0.0138, "macro_f1": 0.3333333432674408, "num_tokens": 2416131.0, "repeat_count": 0.0, "routers_loss": 0.0027060818392783403, "skip_count": 0.0, "step": 1654, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.629278951201748, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.38671875, "learning_rate": 0.0009506645807227311, "loss": 0.0211, "macro_f1": 0.3333333432674408, "num_tokens": 2419042.0, "repeat_count": 0.0, "routers_loss": 0.009012538008391857, "skip_count": 0.0, "step": 1656, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.640932265112891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.279296875, "learning_rate": 0.0009504964534181594, "loss": 0.012, "macro_f1": 0.3333333432674408, "num_tokens": 2422097.0, "repeat_count": 0.0, "routers_loss": 0.0012631615391001105, "skip_count": 0.0, "step": 1658, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.652585579024034, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.201171875, "learning_rate": 0.0009503280550438387, "loss": 0.0119, "macro_f1": 0.3333333432674408, "num_tokens": 2424680.0, "repeat_count": 0.0, "routers_loss": 0.001259277225472033, "skip_count": 0.0, "step": 1660, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.66423889293518, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.453125, "learning_rate": 0.0009501593857010968, "loss": 0.0164, "macro_f1": 0.3333333432674408, "num_tokens": 2427368.0, "repeat_count": 0.0, "routers_loss": 0.001040598377585411, "skip_count": 0.0, "step": 1662, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 9.675892206846322, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.79296875, "learning_rate": 0.0009499904454914241, "loss": 0.0206, "macro_f1": 0.6666666865348816, "num_tokens": 2430210.0, "repeat_count": 0.0, "routers_loss": 0.0062296828255057335, "skip_count": 1.0, "step": 1664, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 9.687545520757466, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.3203125, "learning_rate": 0.0009498212345164742, "loss": 0.0189, "macro_f1": 0.8839138746261597, "num_tokens": 2433747.0, "repeat_count": 1.0, "routers_loss": 0.021873408928513527, "skip_count": 2.0, "step": 1666, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.699198834668609, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.326171875, "learning_rate": 0.0009496517528780637, "loss": 0.0148, "macro_f1": 0.3333333432674408, "num_tokens": 2436727.0, "repeat_count": 0.0, "routers_loss": 0.008581871166825294, "skip_count": 0.0, "step": 1668, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 38.0, "epoch": 9.710852148579752, "f1_execute": 0.9677419066429138, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 0.578125, "learning_rate": 0.0009494820006781718, "loss": 0.0138, "macro_f1": 0.8225806951522827, "num_tokens": 2439826.0, "repeat_count": 3.0, "routers_loss": 0.03817279264330864, "skip_count": 3.0, "step": 1670, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 9.722505462490895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.546875, "learning_rate": 0.000949311978018941, "loss": 0.02, "macro_f1": 0.6666666865348816, "num_tokens": 2442928.0, "repeat_count": 0.0, "routers_loss": 0.0063996645621955395, "skip_count": 1.0, "step": 1672, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.734158776402039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.41015625, "learning_rate": 0.0009491416850026758, "loss": 0.0161, "macro_f1": 0.3333333432674408, "num_tokens": 2445626.0, "repeat_count": 0.0, "routers_loss": 0.0047863260842859745, "skip_count": 0.0, "step": 1674, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.745812090313184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.150390625, "learning_rate": 0.0009489711217318441, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 2448543.0, "repeat_count": 0.0, "routers_loss": 0.002730612177401781, "skip_count": 0.0, "step": 1676, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 9.757465404224327, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.30078125, "learning_rate": 0.000948800288309076, "loss": 0.0235, "macro_f1": 0.5507246255874634, "num_tokens": 2451170.0, "repeat_count": 0.0, "routers_loss": 0.019617855548858643, "skip_count": 2.0, "step": 1678, "text_loss": 0.0 }, { "acc_repeat": 0.800000011920929, "acc_skip": 0.6666666865348816, "avg_layers": 38.0, "epoch": 9.76911871813547, "f1_execute": 0.9655172228813171, "f1_repeat": 0.888888955116272, "f1_skip": 0.800000011920929, "grad_norm": 0.373046875, "learning_rate": 0.0009486291848371642, "loss": 0.0154, "macro_f1": 0.8848021030426025, "num_tokens": 2453967.0, "repeat_count": 5.0, "routers_loss": 0.049637570977211, "skip_count": 3.0, "step": 1680, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.780772032046613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.294921875, "learning_rate": 0.0009484578114190641, "loss": 0.0122, "macro_f1": 0.3333333432674408, "num_tokens": 2457529.0, "repeat_count": 0.0, "routers_loss": 0.00348113221116364, "skip_count": 0.0, "step": 1682, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 9.792425345957756, "f1_execute": 0.9655172228813171, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.267578125, "learning_rate": 0.0009482861681578932, "loss": 0.0156, "macro_f1": 0.8773946762084961, "num_tokens": 2460756.0, "repeat_count": 3.0, "routers_loss": 0.053441185504198074, "skip_count": 4.0, "step": 1684, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.8040786598689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.296875, "learning_rate": 0.0009481142551569317, "loss": 0.0117, "macro_f1": 0.3333333432674408, "num_tokens": 2463894.0, "repeat_count": 0.0, "routers_loss": 0.006894325837492943, "skip_count": 0.0, "step": 1686, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.800000011920929, "avg_layers": 32.0, "epoch": 9.815731973780043, "f1_execute": 0.9841269850730896, "f1_repeat": 0.0, "f1_skip": 0.888888955116272, "grad_norm": 0.2578125, "learning_rate": 0.0009479420725196219, "loss": 0.0122, "macro_f1": 0.6243386268615723, "num_tokens": 2466647.0, "repeat_count": 0.0, "routers_loss": 0.03566410765051842, "skip_count": 5.0, "step": 1688, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 9.827385287691188, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.275390625, "learning_rate": 0.0009477696203495684, "loss": 0.0146, "macro_f1": 0.545751690864563, "num_tokens": 2469810.0, "repeat_count": 1.0, "routers_loss": 0.036238450556993484, "skip_count": 2.0, "step": 1690, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 9.839038601602331, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.23828125, "learning_rate": 0.0009475968987505378, "loss": 0.0256, "macro_f1": 0.5507246255874634, "num_tokens": 2473051.0, "repeat_count": 0.0, "routers_loss": 0.03732121363282204, "skip_count": 2.0, "step": 1692, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 9.850691915513474, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1845703125, "learning_rate": 0.0009474239078264594, "loss": 0.0128, "macro_f1": 1.0, "num_tokens": 2477744.0, "repeat_count": 2.0, "routers_loss": 0.013940983451902866, "skip_count": 3.0, "step": 1694, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 9.862345229424617, "f1_execute": 0.9855071902275085, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.216796875, "learning_rate": 0.0009472506476814238, "loss": 0.0156, "macro_f1": 0.5507246255874634, "num_tokens": 2480689.0, "repeat_count": 2.0, "routers_loss": 0.06919925659894943, "skip_count": 0.0, "step": 1696, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 9.87399854333576, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.47265625, "learning_rate": 0.000947077118419684, "loss": 0.0178, "macro_f1": 0.6666666865348816, "num_tokens": 2483072.0, "repeat_count": 0.0, "routers_loss": 0.0036897105164825916, "skip_count": 2.0, "step": 1698, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 9.885651857246904, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.5234375, "learning_rate": 0.0009469033201456551, "loss": 0.0153, "macro_f1": 0.6666666865348816, "num_tokens": 2485221.0, "repeat_count": 0.0, "routers_loss": 0.008924064226448536, "skip_count": 1.0, "step": 1700, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 9.897305171158049, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.126953125, "learning_rate": 0.0009467292529639138, "loss": 0.0129, "macro_f1": 0.6616915464401245, "num_tokens": 2487708.0, "repeat_count": 1.0, "routers_loss": 0.03707941249012947, "skip_count": 2.0, "step": 1702, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.908958485069192, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.28515625, "learning_rate": 0.0009465549169791984, "loss": 0.0138, "macro_f1": 0.3333333432674408, "num_tokens": 2490211.0, "repeat_count": 0.0, "routers_loss": 0.007260111626237631, "skip_count": 0.0, "step": 1704, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.3984375, "learning_rate": 0.0009463803122964094, "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 2493150.0, "repeat_count": 0.0, "routers_loss": 0.0025850224774330854, "skip_count": 0.0, "step": 1706, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 9.932265112891479, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1689453125, "learning_rate": 0.0009462054390206087, "loss": 0.0154, "macro_f1": 0.661835789680481, "num_tokens": 2495762.0, "repeat_count": 1.0, "routers_loss": 0.044719398021698, "skip_count": 1.0, "step": 1708, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 9.943918426802622, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.5078125, "learning_rate": 0.0009460302972570198, "loss": 0.0246, "macro_f1": 0.6666666865348816, "num_tokens": 2498633.0, "repeat_count": 0.0, "routers_loss": 0.0053523811511695385, "skip_count": 1.0, "step": 1710, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 9.955571740713765, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.0009458548871110282, "loss": 0.0173, "macro_f1": 0.3333333432674408, "num_tokens": 2501148.0, "repeat_count": 0.0, "routers_loss": 0.0034294279757887125, "skip_count": 0.0, "step": 1712, "text_loss": 0.0 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 9.967225054624908, "f1_execute": 0.9705882668495178, "f1_repeat": 0.5, "f1_skip": 0.0, "grad_norm": 0.310546875, "learning_rate": 0.0009456792086881803, "loss": 0.0165, "macro_f1": 0.4901960790157318, "num_tokens": 2503914.0, "repeat_count": 3.0, "routers_loss": 0.040717653930187225, "skip_count": 0.0, "step": 1714, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 9.978878368536053, "f1_execute": 0.9850746393203735, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.318359375, "learning_rate": 0.0009455032620941839, "loss": 0.0194, "macro_f1": 0.8839138746261597, "num_tokens": 2507003.0, "repeat_count": 2.0, "routers_loss": 0.021585552021861076, "skip_count": 1.0, "step": 1716, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 9.990531682447196, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.478515625, "learning_rate": 0.0009453270474349089, "loss": 0.0176, "macro_f1": 0.6616915464401245, "num_tokens": 2509469.0, "repeat_count": 1.0, "routers_loss": 0.02745453268289566, "skip_count": 2.0, "step": 1718, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 34.0, "epoch": 10.0, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.234375, "learning_rate": 0.0009451505648163857, "loss": 0.0161, "macro_f1": 0.9470900297164917, "num_tokens": 2512160.0, "repeat_count": 1.0, "routers_loss": 0.06495889276266098, "skip_count": 4.0, "step": 1720, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 10.011653313911143, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1845703125, "learning_rate": 0.0009449738143448064, "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 2515340.0, "repeat_count": 0.0, "routers_loss": 0.006183023098856211, "skip_count": 2.0, "step": 1722, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 10.023306627822286, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.3046875, "learning_rate": 0.0009447967961265241, "loss": 0.0167, "macro_f1": 0.661835789680481, "num_tokens": 2518927.0, "repeat_count": 0.0, "routers_loss": 0.0065905326046049595, "skip_count": 1.0, "step": 1724, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.03495994173343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1982421875, "learning_rate": 0.000944619510268053, "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 2522409.0, "repeat_count": 0.0, "routers_loss": 0.003395139006897807, "skip_count": 0.0, "step": 1726, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 10.046613255644575, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.294921875, "learning_rate": 0.0009444419568760685, "loss": 0.0124, "macro_f1": 0.5507246255874634, "num_tokens": 2525375.0, "repeat_count": 0.0, "routers_loss": 0.015316382050514221, "skip_count": 2.0, "step": 1728, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.058266569555718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.267578125, "learning_rate": 0.0009442641360574067, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 2528205.0, "repeat_count": 0.0, "routers_loss": 0.004907318856567144, "skip_count": 0.0, "step": 1730, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 10.069919883466861, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.21484375, "learning_rate": 0.0009440860479190647, "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 2531058.0, "repeat_count": 0.0, "routers_loss": 0.00809244904667139, "skip_count": 3.0, "step": 1732, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 10.081573197378004, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.3046875, "learning_rate": 0.0009439076925682006, "loss": 0.0103, "macro_f1": 0.5950249433517456, "num_tokens": 2533588.0, "repeat_count": 0.0, "routers_loss": 0.009379192255437374, "skip_count": 3.0, "step": 1734, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 10.093226511289147, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.2265625, "learning_rate": 0.0009437290701121335, "loss": 0.0136, "macro_f1": 0.661835789680481, "num_tokens": 2537054.0, "repeat_count": 1.0, "routers_loss": 0.025010397657752037, "skip_count": 1.0, "step": 1736, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.10487982520029, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10400390625, "learning_rate": 0.0009435501806583424, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 2539540.0, "repeat_count": 0.0, "routers_loss": 0.003671285230666399, "skip_count": 0.0, "step": 1738, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.116533139111436, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.294921875, "learning_rate": 0.0009433710243144679, "loss": 0.0122, "macro_f1": 0.3333333432674408, "num_tokens": 2542056.0, "repeat_count": 0.0, "routers_loss": 0.000758337031584233, "skip_count": 0.0, "step": 1740, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.128186453022579, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1328125, "learning_rate": 0.0009431916011883104, "loss": 0.0163, "macro_f1": 0.3333333432674408, "num_tokens": 2545173.0, "repeat_count": 0.0, "routers_loss": 0.004188171122223139, "skip_count": 0.0, "step": 1742, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 10.139839766933722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.333984375, "learning_rate": 0.0009430119113878316, "loss": 0.0164, "macro_f1": 0.6666666865348816, "num_tokens": 2547842.0, "repeat_count": 0.0, "routers_loss": 0.006144850514829159, "skip_count": 1.0, "step": 1744, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.151493080844865, "f1_execute": 0.970588207244873, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.51171875, "learning_rate": 0.0009428319550211531, "loss": 0.0141, "macro_f1": 0.656862735748291, "num_tokens": 2550613.0, "repeat_count": 1.0, "routers_loss": 0.027903100475668907, "skip_count": 1.0, "step": 1746, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 10.163146394756009, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.197265625, "learning_rate": 0.0009426517321965568, "loss": 0.008, "macro_f1": 1.0, "num_tokens": 2553136.0, "repeat_count": 2.0, "routers_loss": 0.004126264713704586, "skip_count": 4.0, "step": 1748, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.6666666865348816, "avg_layers": 35.0, "epoch": 10.174799708667152, "f1_execute": 0.9687499403953552, "f1_repeat": 0.6666666865348816, "f1_skip": 0.800000011920929, "grad_norm": 0.326171875, "learning_rate": 0.0009424712430224856, "loss": 0.0128, "macro_f1": 0.8118056058883667, "num_tokens": 2555828.0, "repeat_count": 2.0, "routers_loss": 0.03868389129638672, "skip_count": 3.0, "step": 1750, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 10.186453022578295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.291015625, "learning_rate": 0.0009422904876075419, "loss": 0.0132, "macro_f1": 0.6666666865348816, "num_tokens": 2559542.0, "repeat_count": 0.0, "routers_loss": 0.005023921374231577, "skip_count": 2.0, "step": 1752, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 10.19810633648944, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1484375, "learning_rate": 0.0009421094660604889, "loss": 0.0133, "macro_f1": 0.6666666865348816, "num_tokens": 2562533.0, "repeat_count": 0.0, "routers_loss": 0.0014512044144794345, "skip_count": 1.0, "step": 1754, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 10.209759650400583, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.396484375, "learning_rate": 0.0009419281784902498, "loss": 0.0137, "macro_f1": 0.32380953431129456, "num_tokens": 2564946.0, "repeat_count": 0.0, "routers_loss": 0.02674366720020771, "skip_count": 1.0, "step": 1756, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.221412964311726, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.359375, "learning_rate": 0.0009417466250059073, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 2567723.0, "repeat_count": 0.0, "routers_loss": 0.0018899759743362665, "skip_count": 0.0, "step": 1758, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 10.23306627822287, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.31640625, "learning_rate": 0.000941564805716705, "loss": 0.0122, "macro_f1": 0.6666666865348816, "num_tokens": 2570539.0, "repeat_count": 1.0, "routers_loss": 0.0035795916337519884, "skip_count": 0.0, "step": 1760, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6000000238418579, "avg_layers": 33.0, "epoch": 10.244719592134013, "f1_execute": 0.9687499403953552, "f1_repeat": 0.0, "f1_skip": 0.75, "grad_norm": 0.185546875, "learning_rate": 0.0009413827207320457, "loss": 0.0139, "macro_f1": 0.5729166865348816, "num_tokens": 2574054.0, "repeat_count": 0.0, "routers_loss": 0.029305750504136086, "skip_count": 5.0, "step": 1762, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.256372906045156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.291015625, "learning_rate": 0.0009412003701614926, "loss": 0.0119, "macro_f1": 0.3333333432674408, "num_tokens": 2576665.0, "repeat_count": 0.0, "routers_loss": 0.003538660239428282, "skip_count": 0.0, "step": 1764, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.2680262199563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.306640625, "learning_rate": 0.0009410177541147682, "loss": 0.013, "macro_f1": 0.3333333432674408, "num_tokens": 2579201.0, "repeat_count": 0.0, "routers_loss": 0.002020627958700061, "skip_count": 0.0, "step": 1766, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.279679533867444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1904296875, "learning_rate": 0.0009408348727017554, "loss": 0.0105, "macro_f1": 0.3333333432674408, "num_tokens": 2581982.0, "repeat_count": 0.0, "routers_loss": 0.0011426011333242059, "skip_count": 0.0, "step": 1768, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.291332847778587, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1845703125, "learning_rate": 0.0009406517260324961, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 2584955.0, "repeat_count": 0.0, "routers_loss": 0.0011462711263448, "skip_count": 0.0, "step": 1770, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 10.30298616168973, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.076171875, "learning_rate": 0.0009404683142171923, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 2588243.0, "repeat_count": 0.0, "routers_loss": 0.0018581905169412494, "skip_count": 2.0, "step": 1772, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.314639475600874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2431640625, "learning_rate": 0.0009402846373662051, "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 2591042.0, "repeat_count": 0.0, "routers_loss": 0.005229715257883072, "skip_count": 0.0, "step": 1774, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 10.326292789512017, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.45703125, "learning_rate": 0.0009401006955900555, "loss": 0.0131, "macro_f1": 0.5950249433517456, "num_tokens": 2593526.0, "repeat_count": 0.0, "routers_loss": 0.027999576181173325, "skip_count": 3.0, "step": 1776, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 10.33794610342316, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.294921875, "learning_rate": 0.0009399164889994238, "loss": 0.0161, "macro_f1": 0.8839138746261597, "num_tokens": 2596527.0, "repeat_count": 1.0, "routers_loss": 0.015344873070716858, "skip_count": 2.0, "step": 1778, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 10.349599417334304, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.166015625, "learning_rate": 0.0009397320177051494, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 2599587.0, "repeat_count": 0.0, "routers_loss": 0.0013256485108286142, "skip_count": 2.0, "step": 1780, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 10.361252731245449, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1259765625, "learning_rate": 0.0009395472818182314, "loss": 0.0101, "macro_f1": 1.0, "num_tokens": 2602713.0, "repeat_count": 1.0, "routers_loss": 0.012089124880731106, "skip_count": 2.0, "step": 1782, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 34.0, "epoch": 10.372906045156592, "f1_execute": 0.970588207244873, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.267578125, "learning_rate": 0.0009393622814498274, "loss": 0.0154, "macro_f1": 0.4901960790157318, "num_tokens": 2606152.0, "repeat_count": 0.0, "routers_loss": 0.01893945410847664, "skip_count": 2.0, "step": 1784, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 10.384559359067735, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.2119140625, "learning_rate": 0.000939177016711255, "loss": 0.0112, "macro_f1": 0.6666666865348816, "num_tokens": 2608989.0, "repeat_count": 1.0, "routers_loss": 0.006936570163816214, "skip_count": 0.0, "step": 1786, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 38.0, "epoch": 10.396212672978878, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.279296875, "learning_rate": 0.0009389914877139902, "loss": 0.0085, "macro_f1": 0.8835979700088501, "num_tokens": 2612675.0, "repeat_count": 3.0, "routers_loss": 0.023881442844867706, "skip_count": 2.0, "step": 1788, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 10.407865986890021, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.2060546875, "learning_rate": 0.0009388056945696687, "loss": 0.0093, "macro_f1": 0.5950249433517456, "num_tokens": 2615351.0, "repeat_count": 0.0, "routers_loss": 0.01233698334544897, "skip_count": 2.0, "step": 1790, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.419519300801165, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.26953125, "learning_rate": 0.0009386196373900843, "loss": 0.0127, "macro_f1": 0.32863849401474, "num_tokens": 2618679.0, "repeat_count": 0.0, "routers_loss": 0.014367182739078999, "skip_count": 1.0, "step": 1792, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 34.0, "epoch": 10.43117261471231, "f1_execute": 0.9696969985961914, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.236328125, "learning_rate": 0.0009384333162871904, "loss": 0.0116, "macro_f1": 0.5454546213150024, "num_tokens": 2621591.0, "repeat_count": 0.0, "routers_loss": 0.042170200496912, "skip_count": 4.0, "step": 1794, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.442825928623453, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0009382467313730985, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 2624473.0, "repeat_count": 0.0, "routers_loss": 0.0011267588706687093, "skip_count": 0.0, "step": 1796, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 10.454479242534596, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.283203125, "learning_rate": 0.0009380598827600795, "loss": 0.0165, "macro_f1": 0.5950249433517456, "num_tokens": 2627311.0, "repeat_count": 0.0, "routers_loss": 0.022293567657470703, "skip_count": 3.0, "step": 1798, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.46613255644574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.306640625, "learning_rate": 0.000937872770560563, "loss": 0.0104, "macro_f1": 0.3333333432674408, "num_tokens": 2630100.0, "repeat_count": 0.0, "routers_loss": 0.0025520627386868, "skip_count": 0.0, "step": 1800, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.477785870356882, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.306640625, "learning_rate": 0.0009376853948871364, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 2632871.0, "repeat_count": 0.0, "routers_loss": 0.001963576301932335, "skip_count": 0.0, "step": 1802, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.489439184268026, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.298828125, "learning_rate": 0.0009374977558525464, "loss": 0.0113, "macro_f1": 0.3333333432674408, "num_tokens": 2635599.0, "repeat_count": 0.0, "routers_loss": 0.006379332393407822, "skip_count": 0.0, "step": 1804, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 10.501092498179169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1962890625, "learning_rate": 0.0009373098535696979, "loss": 0.017, "macro_f1": 0.6666666865348816, "num_tokens": 2638492.0, "repeat_count": 0.0, "routers_loss": 0.0068961153738200665, "skip_count": 1.0, "step": 1806, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 10.512745812090314, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.265625, "learning_rate": 0.0009371216881516542, "loss": 0.0103, "macro_f1": 0.6666666865348816, "num_tokens": 2641376.0, "repeat_count": 0.0, "routers_loss": 0.003879483323544264, "skip_count": 1.0, "step": 1808, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.524399126001457, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.734375, "learning_rate": 0.0009369332597116371, "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 2644048.0, "repeat_count": 0.0, "routers_loss": 0.00244098249822855, "skip_count": 0.0, "step": 1810, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 10.5360524399126, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.306640625, "learning_rate": 0.0009367445683630262, "loss": 0.0108, "macro_f1": 0.5507246255874634, "num_tokens": 2646912.0, "repeat_count": 0.0, "routers_loss": 0.026474134996533394, "skip_count": 2.0, "step": 1812, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 10.547705753823744, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.2197265625, "learning_rate": 0.0009365556142193599, "loss": 0.0131, "macro_f1": 1.0, "num_tokens": 2649756.0, "repeat_count": 1.0, "routers_loss": 0.003460227744653821, "skip_count": 2.0, "step": 1814, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 10.559359067734887, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.234375, "learning_rate": 0.0009363663973943343, "loss": 0.0093, "macro_f1": 0.5507246255874634, "num_tokens": 2652538.0, "repeat_count": 0.0, "routers_loss": 0.02064656652510166, "skip_count": 2.0, "step": 1816, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.57101238164603, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.353515625, "learning_rate": 0.0009361769180018038, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 2656253.0, "repeat_count": 0.0, "routers_loss": 0.002829131903126836, "skip_count": 0.0, "step": 1818, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 10.582665695557175, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1865234375, "learning_rate": 0.0009359871761557806, "loss": 0.0126, "macro_f1": 0.5507246255874634, "num_tokens": 2659530.0, "repeat_count": 0.0, "routers_loss": 0.021498242393136024, "skip_count": 2.0, "step": 1820, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 10.594319009468318, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.359375, "learning_rate": 0.0009357971719704348, "loss": 0.0115, "macro_f1": 1.0, "num_tokens": 2662100.0, "repeat_count": 1.0, "routers_loss": 0.0019448766252025962, "skip_count": 3.0, "step": 1822, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.800000011920929, "avg_layers": 34.0, "epoch": 10.605972323379461, "f1_execute": 0.9830508232116699, "f1_repeat": 1.0, "f1_skip": 0.888888955116272, "grad_norm": 0.431640625, "learning_rate": 0.0009356069055600948, "loss": 0.0182, "macro_f1": 0.9573132395744324, "num_tokens": 2665122.0, "repeat_count": 2.0, "routers_loss": 0.016598768532276154, "skip_count": 5.0, "step": 1824, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.617625637290605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.388671875, "learning_rate": 0.0009354163770392461, "loss": 0.011, "macro_f1": 0.3333333432674408, "num_tokens": 2667915.0, "repeat_count": 0.0, "routers_loss": 0.00219886121340096, "skip_count": 0.0, "step": 1826, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.629278951201748, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.57421875, "learning_rate": 0.0009352255865225323, "loss": 0.0172, "macro_f1": 0.3333333432674408, "num_tokens": 2670272.0, "repeat_count": 0.0, "routers_loss": 0.0034747247118502855, "skip_count": 0.0, "step": 1828, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.640932265112891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2490234375, "learning_rate": 0.0009350345341247549, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 2672603.0, "repeat_count": 0.0, "routers_loss": 0.006974602583795786, "skip_count": 0.0, "step": 1830, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 10.652585579024034, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.21484375, "learning_rate": 0.0009348432199608725, "loss": 0.016, "macro_f1": 0.5507246255874634, "num_tokens": 2675311.0, "repeat_count": 0.0, "routers_loss": 0.00839470885694027, "skip_count": 2.0, "step": 1832, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.66423889293518, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.19140625, "learning_rate": 0.0009346516441460014, "loss": 0.0147, "macro_f1": 0.3333333432674408, "num_tokens": 2678263.0, "repeat_count": 0.0, "routers_loss": 0.011625363491475582, "skip_count": 0.0, "step": 1834, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.675892206846322, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.390625, "learning_rate": 0.0009344598067954151, "loss": 0.0164, "macro_f1": 0.3333333432674408, "num_tokens": 2681504.0, "repeat_count": 0.0, "routers_loss": 0.0040642269887030125, "skip_count": 0.0, "step": 1836, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 10.687545520757466, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.240234375, "learning_rate": 0.000934267708024545, "loss": 0.0123, "macro_f1": 1.0, "num_tokens": 2684385.0, "repeat_count": 1.0, "routers_loss": 0.008675353601574898, "skip_count": 1.0, "step": 1838, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 10.699198834668609, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.310546875, "learning_rate": 0.0009340753479489797, "loss": 0.0139, "macro_f1": 0.6666666865348816, "num_tokens": 2687434.0, "repeat_count": 0.0, "routers_loss": 0.014445917680859566, "skip_count": 2.0, "step": 1840, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 10.710852148579752, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2353515625, "learning_rate": 0.0009338827266844643, "loss": 0.0114, "macro_f1": 0.545751690864563, "num_tokens": 2691173.0, "repeat_count": 1.0, "routers_loss": 0.052480436861515045, "skip_count": 2.0, "step": 1842, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 10.722505462490895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2734375, "learning_rate": 0.0009336898443469019, "loss": 0.0145, "macro_f1": 0.6666666865348816, "num_tokens": 2694225.0, "repeat_count": 0.0, "routers_loss": 0.004331192933022976, "skip_count": 2.0, "step": 1844, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 10.734158776402039, "f1_execute": 0.9687499403953552, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1875, "learning_rate": 0.0009334967010523523, "loss": 0.0106, "macro_f1": 0.8784722685813904, "num_tokens": 2697587.0, "repeat_count": 1.0, "routers_loss": 0.0400407575070858, "skip_count": 4.0, "step": 1846, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 10.745812090313184, "f1_execute": 0.9841269850730896, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.2412109375, "learning_rate": 0.0009333032969170325, "loss": 0.0116, "macro_f1": 0.8835979700088501, "num_tokens": 2700421.0, "repeat_count": 2.0, "routers_loss": 0.0367719791829586, "skip_count": 3.0, "step": 1848, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 10.757465404224327, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1328125, "learning_rate": 0.0009331096320573163, "loss": 0.0127, "macro_f1": 0.5507246255874634, "num_tokens": 2703131.0, "repeat_count": 0.0, "routers_loss": 0.02010459266602993, "skip_count": 2.0, "step": 1850, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 10.76911871813547, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1640625, "learning_rate": 0.0009329157065897345, "loss": 0.012, "macro_f1": 0.6666666865348816, "num_tokens": 2706151.0, "repeat_count": 0.0, "routers_loss": 0.00399455102160573, "skip_count": 4.0, "step": 1852, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 10.780772032046613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.306640625, "learning_rate": 0.0009327215206309746, "loss": 0.0146, "macro_f1": 0.6666666865348816, "num_tokens": 2709332.0, "repeat_count": 0.0, "routers_loss": 0.006474701222032309, "skip_count": 2.0, "step": 1854, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 10.792425345957756, "f1_execute": 0.9841269850730896, "f1_repeat": 0.0, "f1_skip": 0.888888955116272, "grad_norm": 0.298828125, "learning_rate": 0.0009325270742978808, "loss": 0.0139, "macro_f1": 0.6243386268615723, "num_tokens": 2712053.0, "repeat_count": 0.0, "routers_loss": 0.017992746084928513, "skip_count": 4.0, "step": 1856, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 10.8040786598689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.294921875, "learning_rate": 0.0009323323677074542, "loss": 0.0139, "macro_f1": 0.6666666865348816, "num_tokens": 2715136.0, "repeat_count": 0.0, "routers_loss": 0.009141829796135426, "skip_count": 1.0, "step": 1858, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.815731973780043, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.26171875, "learning_rate": 0.0009321374009768524, "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 2717988.0, "repeat_count": 0.0, "routers_loss": 0.002829873003065586, "skip_count": 0.0, "step": 1860, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.827385287691188, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.48046875, "learning_rate": 0.0009319421742233894, "loss": 0.0151, "macro_f1": 0.3333333432674408, "num_tokens": 2720792.0, "repeat_count": 0.0, "routers_loss": 0.001398273278027773, "skip_count": 0.0, "step": 1862, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 10.839038601602331, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.216796875, "learning_rate": 0.0009317466875645356, "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 2723395.0, "repeat_count": 0.0, "routers_loss": 0.0075300014577806, "skip_count": 2.0, "step": 1864, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6000000238418579, "avg_layers": 32.0, "epoch": 10.850691915513474, "f1_execute": 0.952380895614624, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.263671875, "learning_rate": 0.0009315509411179181, "loss": 0.0183, "macro_f1": 0.5396825671195984, "num_tokens": 2725862.0, "repeat_count": 0.0, "routers_loss": 0.030897624790668488, "skip_count": 5.0, "step": 1866, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.862345229424617, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2275390625, "learning_rate": 0.0009313549350013204, "loss": 0.011, "macro_f1": 0.3333333432674408, "num_tokens": 2729220.0, "repeat_count": 0.0, "routers_loss": 0.0020411531440913677, "skip_count": 0.0, "step": 1868, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 10.87399854333576, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.18359375, "learning_rate": 0.0009311586693326816, "loss": 0.0098, "macro_f1": 0.6666666865348816, "num_tokens": 2731672.0, "repeat_count": 2.0, "routers_loss": 0.0032862448133528233, "skip_count": 0.0, "step": 1870, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.885651857246904, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.244140625, "learning_rate": 0.0009309621442300975, "loss": 0.01, "macro_f1": 0.32380953431129456, "num_tokens": 2734885.0, "repeat_count": 0.0, "routers_loss": 0.014469991438090801, "skip_count": 2.0, "step": 1872, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.897305171158049, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.29296875, "learning_rate": 0.0009307653598118199, "loss": 0.015, "macro_f1": 0.3333333432674408, "num_tokens": 2737652.0, "repeat_count": 0.0, "routers_loss": 0.001454834477044642, "skip_count": 0.0, "step": 1874, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 10.908958485069192, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1826171875, "learning_rate": 0.0009305683161962568, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 2740344.0, "repeat_count": 1.0, "routers_loss": 0.008459417149424553, "skip_count": 0.0, "step": 1876, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.23046875, "learning_rate": 0.0009303710135019718, "loss": 0.015, "macro_f1": 0.3333333432674408, "num_tokens": 2743890.0, "repeat_count": 0.0, "routers_loss": 0.0008385899709537625, "skip_count": 0.0, "step": 1878, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.932265112891479, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1865234375, "learning_rate": 0.0009301734518476847, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 2747156.0, "repeat_count": 0.0, "routers_loss": 0.002069163601845503, "skip_count": 0.0, "step": 1880, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 10.943918426802622, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.28125, "learning_rate": 0.0009299756313522707, "loss": 0.0157, "macro_f1": 0.6666666865348816, "num_tokens": 2749482.0, "repeat_count": 0.0, "routers_loss": 0.0021297491621226072, "skip_count": 1.0, "step": 1882, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 10.955571740713765, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1513671875, "learning_rate": 0.0009297775521347613, "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 2752301.0, "repeat_count": 0.0, "routers_loss": 0.004468953236937523, "skip_count": 1.0, "step": 1884, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.967225054624908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.46484375, "learning_rate": 0.0009295792143143434, "loss": 0.0178, "macro_f1": 0.3333333432674408, "num_tokens": 2754843.0, "repeat_count": 0.0, "routers_loss": 0.0021415241062641144, "skip_count": 0.0, "step": 1886, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 10.978878368536053, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.265625, "learning_rate": 0.0009293806180103594, "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 2757818.0, "repeat_count": 0.0, "routers_loss": 0.003056338056921959, "skip_count": 0.0, "step": 1888, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 10.990531682447196, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.359375, "learning_rate": 0.0009291817633423076, "loss": 0.014, "macro_f1": 0.5950249433517456, "num_tokens": 2760322.0, "repeat_count": 0.0, "routers_loss": 0.011240174062550068, "skip_count": 2.0, "step": 1890, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.328125, "learning_rate": 0.0009289826504298414, "loss": 0.0143, "macro_f1": 0.3333333432674408, "num_tokens": 2763376.0, "repeat_count": 0.0, "routers_loss": 0.0027451212517917156, "skip_count": 0.0, "step": 1892, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 11.011653313911143, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.30859375, "learning_rate": 0.0009287832793927693, "loss": 0.0111, "macro_f1": 0.6666666865348816, "num_tokens": 2766593.0, "repeat_count": 0.0, "routers_loss": 0.006713921204209328, "skip_count": 2.0, "step": 1894, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 11.023306627822286, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.2373046875, "learning_rate": 0.0009285836503510562, "loss": 0.0119, "macro_f1": 1.0, "num_tokens": 2769340.0, "repeat_count": 1.0, "routers_loss": 0.011625412851572037, "skip_count": 2.0, "step": 1896, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 11.03495994173343, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1787109375, "learning_rate": 0.0009283837634248212, "loss": 0.008, "macro_f1": 0.5507246255874634, "num_tokens": 2771743.0, "repeat_count": 0.0, "routers_loss": 0.008203956298530102, "skip_count": 2.0, "step": 1898, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 11.046613255644575, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1318359375, "learning_rate": 0.0009281836187343389, "loss": 0.0113, "macro_f1": 0.6615384817123413, "num_tokens": 2774723.0, "repeat_count": 1.0, "routers_loss": 0.024051113054156303, "skip_count": 3.0, "step": 1900, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.058266569555718, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.25390625, "learning_rate": 0.0009279832164000391, "loss": 0.0135, "macro_f1": 0.3188405930995941, "num_tokens": 2777726.0, "repeat_count": 1.0, "routers_loss": 0.07300238311290741, "skip_count": 2.0, "step": 1902, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.069919883466861, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2373046875, "learning_rate": 0.0009277825565425067, "loss": 0.0122, "macro_f1": 0.3333333432674408, "num_tokens": 2780745.0, "repeat_count": 0.0, "routers_loss": 0.0010444490471854806, "skip_count": 0.0, "step": 1904, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 11.081573197378004, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1845703125, "learning_rate": 0.0009275816392824812, "loss": 0.011, "macro_f1": 0.6666666865348816, "num_tokens": 2784099.0, "repeat_count": 0.0, "routers_loss": 0.007250302471220493, "skip_count": 2.0, "step": 1906, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.093226511289147, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.4140625, "learning_rate": 0.0009273804647408575, "loss": 0.0107, "macro_f1": 0.3333333432674408, "num_tokens": 2786660.0, "repeat_count": 0.0, "routers_loss": 0.0017673892434686422, "skip_count": 0.0, "step": 1908, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 11.10487982520029, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.35546875, "learning_rate": 0.0009271790330386848, "loss": 0.0118, "macro_f1": 0.6616915464401245, "num_tokens": 2789083.0, "repeat_count": 1.0, "routers_loss": 0.0396571047604084, "skip_count": 2.0, "step": 1910, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.116533139111436, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1572265625, "learning_rate": 0.0009269773442971672, "loss": 0.0105, "macro_f1": 0.32863849401474, "num_tokens": 2791886.0, "repeat_count": 0.0, "routers_loss": 0.012936257757246494, "skip_count": 1.0, "step": 1912, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 37.0, "epoch": 11.128186453022579, "f1_execute": 0.9846153855323792, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.392578125, "learning_rate": 0.0009267753986376637, "loss": 0.0129, "macro_f1": 0.8837606906890869, "num_tokens": 2794746.0, "repeat_count": 2.0, "routers_loss": 0.025228694081306458, "skip_count": 2.0, "step": 1914, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.800000011920929, "avg_layers": 32.0, "epoch": 11.139839766933722, "f1_execute": 0.9841269850730896, "f1_repeat": 0.0, "f1_skip": 0.888888955116272, "grad_norm": 0.2294921875, "learning_rate": 0.0009265731961816876, "loss": 0.0081, "macro_f1": 0.6243386268615723, "num_tokens": 2797257.0, "repeat_count": 0.0, "routers_loss": 0.010382485575973988, "skip_count": 5.0, "step": 1916, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 11.151493080844865, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.3359375, "learning_rate": 0.0009263707370509069, "loss": 0.009, "macro_f1": 0.6666666865348816, "num_tokens": 2799845.0, "repeat_count": 0.0, "routers_loss": 0.007449906785041094, "skip_count": 1.0, "step": 1918, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.163146394756009, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2578125, "learning_rate": 0.0009261680213671438, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 2802915.0, "repeat_count": 0.0, "routers_loss": 0.002620422514155507, "skip_count": 0.0, "step": 1920, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 11.174799708667152, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.361328125, "learning_rate": 0.0009259650492523752, "loss": 0.0113, "macro_f1": 0.6666666865348816, "num_tokens": 2805282.0, "repeat_count": 0.0, "routers_loss": 0.007958375848829746, "skip_count": 2.0, "step": 1922, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.8333333134651184, "avg_layers": 32.0, "epoch": 11.186453022578295, "f1_execute": 0.9830508232116699, "f1_repeat": 1.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.29296875, "learning_rate": 0.0009257618208287321, "loss": 0.0128, "macro_f1": 0.9640473127365112, "num_tokens": 2807922.0, "repeat_count": 1.0, "routers_loss": 0.01677585206925869, "skip_count": 6.0, "step": 1924, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 11.19810633648944, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2109375, "learning_rate": 0.0009255583362184998, "loss": 0.0102, "macro_f1": 0.6616915464401245, "num_tokens": 2810710.0, "repeat_count": 1.0, "routers_loss": 0.011521116830408573, "skip_count": 2.0, "step": 1926, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 11.209759650400583, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1376953125, "learning_rate": 0.0009253545955441177, "loss": 0.012, "macro_f1": 0.661835789680481, "num_tokens": 2813613.0, "repeat_count": 1.0, "routers_loss": 0.020628267899155617, "skip_count": 1.0, "step": 1928, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 11.221412964311726, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.2216796875, "learning_rate": 0.0009251505989281792, "loss": 0.0067, "macro_f1": 0.661835789680481, "num_tokens": 2816675.0, "repeat_count": 1.0, "routers_loss": 0.016508756205439568, "skip_count": 1.0, "step": 1930, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.23306627822287, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08935546875, "learning_rate": 0.000924946346493432, "loss": 0.0086, "macro_f1": 0.32863849401474, "num_tokens": 2819389.0, "repeat_count": 0.0, "routers_loss": 0.025203371420502663, "skip_count": 1.0, "step": 1932, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.244719592134013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5703125, "learning_rate": 0.0009247418383627773, "loss": 0.0127, "macro_f1": 0.3333333432674408, "num_tokens": 2822578.0, "repeat_count": 0.0, "routers_loss": 0.0016746397595852613, "skip_count": 0.0, "step": 1934, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.256372906045156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.0009245370746592705, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 2825461.0, "repeat_count": 0.0, "routers_loss": 0.0015024561434984207, "skip_count": 0.0, "step": 1936, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 11.2680262199563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.177734375, "learning_rate": 0.0009243320555061206, "loss": 0.0119, "macro_f1": 0.6666666865348816, "num_tokens": 2827953.0, "repeat_count": 0.0, "routers_loss": 0.004051640164107084, "skip_count": 2.0, "step": 1938, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 11.279679533867444, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.431640625, "learning_rate": 0.0009241267810266903, "loss": 0.0106, "macro_f1": 1.0, "num_tokens": 2830850.0, "repeat_count": 5.0, "routers_loss": 0.002451489679515362, "skip_count": 6.0, "step": 1940, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 11.291332847778587, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.162109375, "learning_rate": 0.0009239212513444962, "loss": 0.0111, "macro_f1": 1.0, "num_tokens": 2833776.0, "repeat_count": 1.0, "routers_loss": 0.00272200140170753, "skip_count": 1.0, "step": 1942, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 11.30298616168973, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.26171875, "learning_rate": 0.0009237154665832082, "loss": 0.0106, "macro_f1": 0.6666666865348816, "num_tokens": 2836723.0, "repeat_count": 0.0, "routers_loss": 0.006044249981641769, "skip_count": 2.0, "step": 1944, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.314639475600874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.181640625, "learning_rate": 0.0009235094268666498, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 2839746.0, "repeat_count": 0.0, "routers_loss": 0.005176035221666098, "skip_count": 0.0, "step": 1946, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 11.326292789512017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.23046875, "learning_rate": 0.0009233031323187976, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 2842621.0, "repeat_count": 0.0, "routers_loss": 0.001398799940943718, "skip_count": 2.0, "step": 1948, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 11.33794610342316, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.140625, "learning_rate": 0.000923096583063782, "loss": 0.0131, "macro_f1": 0.6666666865348816, "num_tokens": 2846051.0, "repeat_count": 0.0, "routers_loss": 0.0030323336832225323, "skip_count": 2.0, "step": 1950, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 11.349599417334304, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.283203125, "learning_rate": 0.0009228897792258865, "loss": 0.0137, "macro_f1": 0.6666666865348816, "num_tokens": 2848646.0, "repeat_count": 1.0, "routers_loss": 0.005735136102885008, "skip_count": 0.0, "step": 1952, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.361252731245449, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.375, "learning_rate": 0.0009226827209295476, "loss": 0.0181, "macro_f1": 0.3333333432674408, "num_tokens": 2851715.0, "repeat_count": 0.0, "routers_loss": 0.0032814207952469587, "skip_count": 0.0, "step": 1954, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 11.372906045156592, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.322265625, "learning_rate": 0.0009224754082993551, "loss": 0.0149, "macro_f1": 0.5507246255874634, "num_tokens": 2854661.0, "repeat_count": 0.0, "routers_loss": 0.052133575081825256, "skip_count": 2.0, "step": 1956, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 11.384559359067735, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1982421875, "learning_rate": 0.0009222678414600519, "loss": 0.0113, "macro_f1": 0.661835789680481, "num_tokens": 2857467.0, "repeat_count": 1.0, "routers_loss": 0.02456739731132984, "skip_count": 1.0, "step": 1958, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.396212672978878, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.37890625, "learning_rate": 0.0009220600205365335, "loss": 0.0184, "macro_f1": 0.32863849401474, "num_tokens": 2860459.0, "repeat_count": 0.0, "routers_loss": 0.010598164983093739, "skip_count": 1.0, "step": 1960, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 11.407865986890021, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.240234375, "learning_rate": 0.0009218519456538486, "loss": 0.0133, "macro_f1": 0.661835789680481, "num_tokens": 2862913.0, "repeat_count": 1.0, "routers_loss": 0.01706852577626705, "skip_count": 1.0, "step": 1962, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.419519300801165, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.236328125, "learning_rate": 0.0009216436169371989, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 2865837.0, "repeat_count": 0.0, "routers_loss": 0.0008655673009343445, "skip_count": 0.0, "step": 1964, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.43117261471231, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2109375, "learning_rate": 0.0009214350345119383, "loss": 0.0117, "macro_f1": 0.32863849401474, "num_tokens": 2868284.0, "repeat_count": 1.0, "routers_loss": 0.01365406159311533, "skip_count": 0.0, "step": 1966, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 11.442825928623453, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.177734375, "learning_rate": 0.0009212261985035739, "loss": 0.0144, "macro_f1": 0.5507246255874634, "num_tokens": 2871384.0, "repeat_count": 0.0, "routers_loss": 0.026312991976737976, "skip_count": 2.0, "step": 1968, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.454479242534596, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.427734375, "learning_rate": 0.0009210171090377647, "loss": 0.0172, "macro_f1": 0.3333333432674408, "num_tokens": 2874326.0, "repeat_count": 0.0, "routers_loss": 0.00256463629193604, "skip_count": 0.0, "step": 1970, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 11.46613255644574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.333984375, "learning_rate": 0.000920807766240323, "loss": 0.0147, "macro_f1": 0.6666666865348816, "num_tokens": 2876804.0, "repeat_count": 0.0, "routers_loss": 0.009263782761991024, "skip_count": 1.0, "step": 1972, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.477785870356882, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.40234375, "learning_rate": 0.000920598170237213, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 2879856.0, "repeat_count": 0.0, "routers_loss": 0.0017648438224568963, "skip_count": 0.0, "step": 1974, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 33.0, "epoch": 11.489439184268026, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.1953125, "learning_rate": 0.0009203883211545516, "loss": 0.0077, "macro_f1": 0.6139194369316101, "num_tokens": 2882567.0, "repeat_count": 0.0, "routers_loss": 0.014991719275712967, "skip_count": 4.0, "step": 1976, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 11.501092498179169, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.283203125, "learning_rate": 0.0009201782191186077, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 2885525.0, "repeat_count": 1.0, "routers_loss": 0.00945229735225439, "skip_count": 0.0, "step": 1978, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.512745812090314, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.384765625, "learning_rate": 0.0009199678642558023, "loss": 0.0162, "macro_f1": 0.3333333432674408, "num_tokens": 2888205.0, "repeat_count": 0.0, "routers_loss": 0.005159673746675253, "skip_count": 0.0, "step": 1980, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.75, "avg_layers": 34.0, "epoch": 11.524399126001457, "f1_execute": 0.9677419066429138, "f1_repeat": 0.6666666865348816, "f1_skip": 0.8571428656578064, "grad_norm": 0.51953125, "learning_rate": 0.0009197572566927091, "loss": 0.0129, "macro_f1": 0.8305171728134155, "num_tokens": 2890932.0, "repeat_count": 2.0, "routers_loss": 0.03451452776789665, "skip_count": 4.0, "step": 1982, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.5360524399126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.294921875, "learning_rate": 0.0009195463965560531, "loss": 0.0133, "macro_f1": 0.3333333432674408, "num_tokens": 2894167.0, "repeat_count": 0.0, "routers_loss": 0.004142904654145241, "skip_count": 0.0, "step": 1984, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.547705753823744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2255859375, "learning_rate": 0.0009193352839727121, "loss": 0.0118, "macro_f1": 0.3333333432674408, "num_tokens": 2897395.0, "repeat_count": 0.0, "routers_loss": 0.0017827711999416351, "skip_count": 0.0, "step": 1986, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.559359067734887, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.126953125, "learning_rate": 0.0009191239190697151, "loss": 0.0122, "macro_f1": 0.3333333432674408, "num_tokens": 2899998.0, "repeat_count": 0.0, "routers_loss": 0.002281503053382039, "skip_count": 0.0, "step": 1988, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 32.0, "epoch": 11.57101238164603, "f1_execute": 0.9677419066429138, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.30859375, "learning_rate": 0.0009189123019742432, "loss": 0.0111, "macro_f1": 0.5892473459243774, "num_tokens": 2902509.0, "repeat_count": 0.0, "routers_loss": 0.026906557381153107, "skip_count": 6.0, "step": 1990, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 11.582665695557175, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.2314453125, "learning_rate": 0.0009187004328136293, "loss": 0.0132, "macro_f1": 0.4901960790157318, "num_tokens": 2905670.0, "repeat_count": 0.0, "routers_loss": 0.03854414448142052, "skip_count": 3.0, "step": 1992, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.594319009468318, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1904296875, "learning_rate": 0.0009184883117153577, "loss": 0.0183, "macro_f1": 0.3333333432674408, "num_tokens": 2908685.0, "repeat_count": 0.0, "routers_loss": 0.002587145660072565, "skip_count": 0.0, "step": 1994, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.605972323379461, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1884765625, "learning_rate": 0.0009182759388070649, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 2911516.0, "repeat_count": 0.0, "routers_loss": 0.005528806243091822, "skip_count": 0.0, "step": 1996, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 11.617625637290605, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.12158203125, "learning_rate": 0.0009180633142165384, "loss": 0.0112, "macro_f1": 0.5950249433517456, "num_tokens": 2915647.0, "repeat_count": 0.0, "routers_loss": 0.01538484450429678, "skip_count": 3.0, "step": 1998, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.629278951201748, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2490234375, "learning_rate": 0.0009178504380717169, "loss": 0.0119, "macro_f1": 0.3333333432674408, "num_tokens": 2919056.0, "repeat_count": 0.0, "routers_loss": 0.007219294086098671, "skip_count": 0.0, "step": 2000, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 11.640932265112891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.255859375, "learning_rate": 0.0009176373105006915, "loss": 0.0135, "macro_f1": 0.6666666865348816, "num_tokens": 2923239.0, "repeat_count": 0.0, "routers_loss": 0.003927659709006548, "skip_count": 5.0, "step": 2002, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 11.652585579024034, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.15625, "learning_rate": 0.0009174239316317032, "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 2925989.0, "repeat_count": 0.0, "routers_loss": 0.00832356233149767, "skip_count": 2.0, "step": 2004, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 11.66423889293518, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.2734375, "learning_rate": 0.0009172103015931454, "loss": 0.0173, "macro_f1": 1.0, "num_tokens": 2928796.0, "repeat_count": 2.0, "routers_loss": 0.004353287164121866, "skip_count": 2.0, "step": 2006, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.675892206846322, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.296875, "learning_rate": 0.0009169964205135621, "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 2931326.0, "repeat_count": 0.0, "routers_loss": 0.004138830583542585, "skip_count": 0.0, "step": 2008, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 11.687545520757466, "f1_execute": 0.9687499403953552, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.25390625, "learning_rate": 0.0009167822885216481, "loss": 0.0132, "macro_f1": 0.8784722685813904, "num_tokens": 2934419.0, "repeat_count": 1.0, "routers_loss": 0.06500927358865738, "skip_count": 4.0, "step": 2010, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.699198834668609, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.4765625, "learning_rate": 0.0009165679057462499, "loss": 0.0228, "macro_f1": 0.3333333432674408, "num_tokens": 2937904.0, "repeat_count": 0.0, "routers_loss": 0.0016313480446115136, "skip_count": 0.0, "step": 2012, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.710852148579752, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2890625, "learning_rate": 0.0009163532723163641, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 2940379.0, "repeat_count": 0.0, "routers_loss": 0.0077081345953047276, "skip_count": 0.0, "step": 2014, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.722505462490895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2578125, "learning_rate": 0.000916138388361139, "loss": 0.0094, "macro_f1": 0.3333333432674408, "num_tokens": 2943966.0, "repeat_count": 0.0, "routers_loss": 0.002444128505885601, "skip_count": 0.0, "step": 2016, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.734158776402039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.345703125, "learning_rate": 0.0009159232540098725, "loss": 0.0147, "macro_f1": 0.3333333432674408, "num_tokens": 2947344.0, "repeat_count": 0.0, "routers_loss": 0.005669345147907734, "skip_count": 0.0, "step": 2018, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 11.745812090313184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2255859375, "learning_rate": 0.0009157078693920143, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 2950271.0, "repeat_count": 0.0, "routers_loss": 0.01392229925841093, "skip_count": 2.0, "step": 2020, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 11.757465404224327, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.244140625, "learning_rate": 0.0009154922346371641, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 2953192.0, "repeat_count": 0.0, "routers_loss": 0.001756921410560608, "skip_count": 2.0, "step": 2022, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.76911871813547, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.123046875, "learning_rate": 0.0009152763498750723, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 2956272.0, "repeat_count": 0.0, "routers_loss": 0.001235690084286034, "skip_count": 0.0, "step": 2024, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 11.780772032046613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.173828125, "learning_rate": 0.0009150602152356394, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 2959195.0, "repeat_count": 0.0, "routers_loss": 0.0009728042059578001, "skip_count": 2.0, "step": 2026, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 11.792425345957756, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.224609375, "learning_rate": 0.0009148438308489167, "loss": 0.0105, "macro_f1": 0.5950249433517456, "num_tokens": 2962987.0, "repeat_count": 0.0, "routers_loss": 0.021171389147639275, "skip_count": 3.0, "step": 2028, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 11.8040786598689, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.3984375, "learning_rate": 0.0009146271968451056, "loss": 0.014, "macro_f1": 0.661835789680481, "num_tokens": 2966493.0, "repeat_count": 1.0, "routers_loss": 0.04419359937310219, "skip_count": 1.0, "step": 2030, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.815731973780043, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.232421875, "learning_rate": 0.0009144103133545576, "loss": 0.0201, "macro_f1": 0.3333333432674408, "num_tokens": 2969073.0, "repeat_count": 0.0, "routers_loss": 0.0015578239690512419, "skip_count": 0.0, "step": 2032, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.827385287691188, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2373046875, "learning_rate": 0.0009141931805077743, "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 2971857.0, "repeat_count": 0.0, "routers_loss": 0.002162780612707138, "skip_count": 0.0, "step": 2034, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 11.839038601602331, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.275390625, "learning_rate": 0.0009139757984354078, "loss": 0.0124, "macro_f1": 0.6666666865348816, "num_tokens": 2975259.0, "repeat_count": 1.0, "routers_loss": 0.006888649892061949, "skip_count": 0.0, "step": 2036, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 11.850691915513474, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2890625, "learning_rate": 0.0009137581672682594, "loss": 0.0115, "macro_f1": 0.5507246255874634, "num_tokens": 2978168.0, "repeat_count": 0.0, "routers_loss": 0.012387235648930073, "skip_count": 2.0, "step": 2038, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.862345229424617, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.439453125, "learning_rate": 0.0009135402871372809, "loss": 0.0109, "macro_f1": 0.3333333432674408, "num_tokens": 2980926.0, "repeat_count": 0.0, "routers_loss": 0.003205870511010289, "skip_count": 0.0, "step": 2040, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.87399854333576, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.294921875, "learning_rate": 0.0009133221581735737, "loss": 0.0125, "macro_f1": 0.32863849401474, "num_tokens": 2983515.0, "repeat_count": 0.0, "routers_loss": 0.028576457872986794, "skip_count": 1.0, "step": 2042, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 11.885651857246904, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.171875, "learning_rate": 0.0009131037805083888, "loss": 0.0106, "macro_f1": 1.0, "num_tokens": 2986947.0, "repeat_count": 1.0, "routers_loss": 0.011476028710603714, "skip_count": 3.0, "step": 2044, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 11.897305171158049, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.21875, "learning_rate": 0.0009128851542731271, "loss": 0.0108, "macro_f1": 1.0, "num_tokens": 2990005.0, "repeat_count": 1.0, "routers_loss": 0.024261275306344032, "skip_count": 5.0, "step": 2046, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6000000238418579, "avg_layers": 33.0, "epoch": 11.908958485069192, "f1_execute": 0.9687499403953552, "f1_repeat": 0.0, "f1_skip": 0.75, "grad_norm": 0.189453125, "learning_rate": 0.0009126662795993388, "loss": 0.0098, "macro_f1": 0.5729166865348816, "num_tokens": 2992924.0, "repeat_count": 0.0, "routers_loss": 0.025440583005547523, "skip_count": 5.0, "step": 2048, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1337890625, "learning_rate": 0.0009124471566187238, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 2995983.0, "repeat_count": 0.0, "routers_loss": 0.004794388078153133, "skip_count": 0.0, "step": 2050, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 11.932265112891479, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1162109375, "learning_rate": 0.0009122277854631313, "loss": 0.0105, "macro_f1": 1.0, "num_tokens": 2999045.0, "repeat_count": 1.0, "routers_loss": 0.0011246383655816317, "skip_count": 1.0, "step": 2052, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 11.943918426802622, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.197265625, "learning_rate": 0.00091200816626456, "loss": 0.012, "macro_f1": 0.6666666865348816, "num_tokens": 3002033.0, "repeat_count": 0.0, "routers_loss": 0.0020106551237404346, "skip_count": 1.0, "step": 2054, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.955571740713765, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.330078125, "learning_rate": 0.0009117882991551576, "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 3004257.0, "repeat_count": 0.0, "routers_loss": 0.003389935242012143, "skip_count": 0.0, "step": 2056, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.967225054624908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2177734375, "learning_rate": 0.000911568184267221, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 3006697.0, "repeat_count": 0.0, "routers_loss": 0.001681700930930674, "skip_count": 0.0, "step": 2058, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.978878368536053, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.265625, "learning_rate": 0.0009113478217331963, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 3009402.0, "repeat_count": 0.0, "routers_loss": 0.001484179520048201, "skip_count": 0.0, "step": 2060, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 11.990531682447196, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.201171875, "learning_rate": 0.0009111272116856788, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 3012312.0, "repeat_count": 0.0, "routers_loss": 0.0022270777262747288, "skip_count": 0.0, "step": 2062, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 12.0, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1650390625, "learning_rate": 0.0009109063542574123, "loss": 0.01, "macro_f1": 0.6666666865348816, "num_tokens": 3014592.0, "repeat_count": 2.0, "routers_loss": 0.0017252127872779965, "skip_count": 0.0, "step": 2064, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 12.011653313911143, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.3828125, "learning_rate": 0.0009106852495812896, "loss": 0.0093, "macro_f1": 0.5507246255874634, "num_tokens": 3017490.0, "repeat_count": 0.0, "routers_loss": 0.009529488161206245, "skip_count": 2.0, "step": 2066, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 12.023306627822286, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.6328125, "learning_rate": 0.0009104638977903526, "loss": 0.0104, "macro_f1": 0.3188405930995941, "num_tokens": 3020304.0, "repeat_count": 0.0, "routers_loss": 0.06488226354122162, "skip_count": 2.0, "step": 2068, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.03495994173343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1357421875, "learning_rate": 0.0009102422990177916, "loss": 0.0105, "macro_f1": 0.3333333432674408, "num_tokens": 3023561.0, "repeat_count": 0.0, "routers_loss": 0.0031205564737319946, "skip_count": 0.0, "step": 2070, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.046613255644575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2001953125, "learning_rate": 0.0009100204533969454, "loss": 0.0121, "macro_f1": 0.3333333432674408, "num_tokens": 3026407.0, "repeat_count": 0.0, "routers_loss": 0.0011402347590774298, "skip_count": 0.0, "step": 2072, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.058266569555718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2109375, "learning_rate": 0.0009097983610613016, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 3028981.0, "repeat_count": 0.0, "routers_loss": 0.001060041948221624, "skip_count": 0.0, "step": 2074, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 12.069919883466861, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.255859375, "learning_rate": 0.0009095760221444959, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 3032350.0, "repeat_count": 0.0, "routers_loss": 0.003154321573674679, "skip_count": 1.0, "step": 2076, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 12.081573197378004, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.291015625, "learning_rate": 0.0009093534367803129, "loss": 0.0127, "macro_f1": 0.5507246255874634, "num_tokens": 3035528.0, "repeat_count": 0.0, "routers_loss": 0.01921572908759117, "skip_count": 2.0, "step": 2078, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.093226511289147, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.283203125, "learning_rate": 0.000909130605102685, "loss": 0.016, "macro_f1": 0.3333333432674408, "num_tokens": 3038308.0, "repeat_count": 0.0, "routers_loss": 0.0023650568909943104, "skip_count": 0.0, "step": 2080, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 12.10487982520029, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.455078125, "learning_rate": 0.0009089075272456932, "loss": 0.0109, "macro_f1": 0.6616915464401245, "num_tokens": 3041256.0, "repeat_count": 1.0, "routers_loss": 0.08696327358484268, "skip_count": 2.0, "step": 2082, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 34.0, "epoch": 12.116533139111436, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.158203125, "learning_rate": 0.0009086842033435664, "loss": 0.0133, "macro_f1": 0.9470900297164917, "num_tokens": 3043831.0, "repeat_count": 1.0, "routers_loss": 0.03435390070080757, "skip_count": 4.0, "step": 2084, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.128186453022579, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1015625, "learning_rate": 0.0009084606335306813, "loss": 0.0094, "macro_f1": 0.32863849401474, "num_tokens": 3046803.0, "repeat_count": 0.0, "routers_loss": 0.01038414891809225, "skip_count": 1.0, "step": 2086, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 12.139839766933722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.26171875, "learning_rate": 0.0009082368179415632, "loss": 0.0094, "macro_f1": 0.6666666865348816, "num_tokens": 3049698.0, "repeat_count": 0.0, "routers_loss": 0.007577094715088606, "skip_count": 2.0, "step": 2088, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.151493080844865, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.17578125, "learning_rate": 0.0009080127567108846, "loss": 0.0104, "macro_f1": 0.32863849401474, "num_tokens": 3052833.0, "repeat_count": 0.0, "routers_loss": 0.00685408478602767, "skip_count": 1.0, "step": 2090, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 12.163146394756009, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.27734375, "learning_rate": 0.0009077884499734666, "loss": 0.0102, "macro_f1": 1.0, "num_tokens": 3055451.0, "repeat_count": 1.0, "routers_loss": 0.0036763863172382116, "skip_count": 1.0, "step": 2092, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 12.174799708667152, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.259765625, "learning_rate": 0.0009075638978642771, "loss": 0.0106, "macro_f1": 0.6666666865348816, "num_tokens": 3058876.0, "repeat_count": 1.0, "routers_loss": 0.0007554483017884195, "skip_count": 0.0, "step": 2094, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 12.186453022578295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.4296875, "learning_rate": 0.0009073391005184324, "loss": 0.0134, "macro_f1": 0.6666666865348816, "num_tokens": 3061894.0, "repeat_count": 0.0, "routers_loss": 0.004020949825644493, "skip_count": 1.0, "step": 2096, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 12.19810633648944, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.234375, "learning_rate": 0.000907114058071196, "loss": 0.0105, "macro_f1": 1.0, "num_tokens": 3064488.0, "repeat_count": 2.0, "routers_loss": 0.008739014156162739, "skip_count": 4.0, "step": 2098, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 12.209759650400583, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.349609375, "learning_rate": 0.000906888770657979, "loss": 0.012, "macro_f1": 0.6666666865348816, "num_tokens": 3067930.0, "repeat_count": 1.0, "routers_loss": 0.0019971816800534725, "skip_count": 0.0, "step": 2100, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 12.221412964311726, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.3671875, "learning_rate": 0.0009066632384143396, "loss": 0.0093, "macro_f1": 1.0, "num_tokens": 3071105.0, "repeat_count": 2.0, "routers_loss": 0.00708745326846838, "skip_count": 3.0, "step": 2102, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 33.0, "epoch": 12.23306627822287, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.248046875, "learning_rate": 0.0009064374614759838, "loss": 0.0105, "macro_f1": 0.6139194369316101, "num_tokens": 3074838.0, "repeat_count": 0.0, "routers_loss": 0.010642576031386852, "skip_count": 4.0, "step": 2104, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 12.244719592134013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.14453125, "learning_rate": 0.0009062114399787647, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 3077678.0, "repeat_count": 0.0, "routers_loss": 0.004578464664518833, "skip_count": 2.0, "step": 2106, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 12.256372906045156, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.255859375, "learning_rate": 0.0009059851740586822, "loss": 0.0112, "macro_f1": 0.545751690864563, "num_tokens": 3080489.0, "repeat_count": 1.0, "routers_loss": 0.0279643926769495, "skip_count": 2.0, "step": 2108, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 12.2680262199563, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1396484375, "learning_rate": 0.0009057586638518836, "loss": 0.0097, "macro_f1": 1.0, "num_tokens": 3083162.0, "repeat_count": 1.0, "routers_loss": 0.01619141735136509, "skip_count": 3.0, "step": 2110, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 12.279679533867444, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.23828125, "learning_rate": 0.0009055319094946634, "loss": 0.0103, "macro_f1": 0.661835789680481, "num_tokens": 3085694.0, "repeat_count": 1.0, "routers_loss": 0.013448198325932026, "skip_count": 1.0, "step": 2112, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 12.291332847778587, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.306640625, "learning_rate": 0.0009053049111234624, "loss": 0.0097, "macro_f1": 0.9470900297164917, "num_tokens": 3088817.0, "repeat_count": 1.0, "routers_loss": 0.015851864591240883, "skip_count": 3.0, "step": 2114, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 12.30298616168973, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2236328125, "learning_rate": 0.0009050776688748688, "loss": 0.0073, "macro_f1": 0.6616915464401245, "num_tokens": 3091916.0, "repeat_count": 1.0, "routers_loss": 0.01326720416545868, "skip_count": 2.0, "step": 2116, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.314639475600874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09326171875, "learning_rate": 0.000904850182885617, "loss": 0.0097, "macro_f1": 0.3333333432674408, "num_tokens": 3094477.0, "repeat_count": 0.0, "routers_loss": 0.0024589013773947954, "skip_count": 0.0, "step": 2118, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 12.326292789512017, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.322265625, "learning_rate": 0.0009046224532925887, "loss": 0.0103, "macro_f1": 0.6615384817123413, "num_tokens": 3097814.0, "repeat_count": 1.0, "routers_loss": 0.017271697521209717, "skip_count": 3.0, "step": 2120, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.33794610342316, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.255859375, "learning_rate": 0.0009043944802328117, "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 3100443.0, "repeat_count": 0.0, "routers_loss": 0.0020142768044024706, "skip_count": 0.0, "step": 2122, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.349599417334304, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1337890625, "learning_rate": 0.0009041662638434602, "loss": 0.013, "macro_f1": 0.3333333432674408, "num_tokens": 3103538.0, "repeat_count": 0.0, "routers_loss": 0.0007582574617117643, "skip_count": 0.0, "step": 2124, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 12.361252731245449, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.287109375, "learning_rate": 0.0009039378042618555, "loss": 0.0114, "macro_f1": 0.661835789680481, "num_tokens": 3106301.0, "repeat_count": 1.0, "routers_loss": 0.010196556337177753, "skip_count": 1.0, "step": 2126, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 12.372906045156592, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.0009037091016254646, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 3109965.0, "repeat_count": 1.0, "routers_loss": 0.004467344377189875, "skip_count": 0.0, "step": 2128, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 12.384559359067735, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1494140625, "learning_rate": 0.0009034801560719011, "loss": 0.0085, "macro_f1": 0.6616915464401245, "num_tokens": 3112611.0, "repeat_count": 2.0, "routers_loss": 0.016951793804764748, "skip_count": 1.0, "step": 2130, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.8333333134651184, "avg_layers": 32.0, "epoch": 12.396212672978878, "f1_execute": 0.9830508232116699, "f1_repeat": 1.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.34375, "learning_rate": 0.0009032509677389241, "loss": 0.0108, "macro_f1": 0.9640473127365112, "num_tokens": 3115142.0, "repeat_count": 1.0, "routers_loss": 0.012783058919012547, "skip_count": 6.0, "step": 2132, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 12.407865986890021, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1357421875, "learning_rate": 0.0009030215367644399, "loss": 0.0167, "macro_f1": 0.6666666865348816, "num_tokens": 3117747.0, "repeat_count": 0.0, "routers_loss": 0.010165547020733356, "skip_count": 2.0, "step": 2134, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.419519300801165, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.0009027918632864998, "loss": 0.0103, "macro_f1": 0.3333333432674408, "num_tokens": 3120828.0, "repeat_count": 0.0, "routers_loss": 0.0036117848940193653, "skip_count": 0.0, "step": 2136, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 12.43117261471231, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.251953125, "learning_rate": 0.0009025619474433014, "loss": 0.0104, "macro_f1": 0.6666666865348816, "num_tokens": 3123309.0, "repeat_count": 0.0, "routers_loss": 0.002214879961684346, "skip_count": 1.0, "step": 2138, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 12.442825928623453, "f1_execute": 0.9687499403953552, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2001953125, "learning_rate": 0.0009023317893731884, "loss": 0.0088, "macro_f1": 0.8784722685813904, "num_tokens": 3126341.0, "repeat_count": 1.0, "routers_loss": 0.036413636058568954, "skip_count": 4.0, "step": 2140, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.454479242534596, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1484375, "learning_rate": 0.0009021013892146499, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 3129606.0, "repeat_count": 0.0, "routers_loss": 0.0016914433799684048, "skip_count": 0.0, "step": 2142, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.46613255644574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2470703125, "learning_rate": 0.0009018707471063205, "loss": 0.0115, "macro_f1": 0.3333333432674408, "num_tokens": 3132848.0, "repeat_count": 0.0, "routers_loss": 0.0015236898325383663, "skip_count": 0.0, "step": 2144, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.477785870356882, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.126953125, "learning_rate": 0.0009016398631869811, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 3135946.0, "repeat_count": 0.0, "routers_loss": 0.001211925526149571, "skip_count": 0.0, "step": 2146, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.489439184268026, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1689453125, "learning_rate": 0.0009014087375955574, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 3138921.0, "repeat_count": 0.0, "routers_loss": 0.0003138465981464833, "skip_count": 0.0, "step": 2148, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6000000238418579, "avg_layers": 33.0, "epoch": 12.501092498179169, "f1_execute": 0.9687499403953552, "f1_repeat": 0.0, "f1_skip": 0.75, "grad_norm": 0.2041015625, "learning_rate": 0.0009011773704711205, "loss": 0.0114, "macro_f1": 0.5729166865348816, "num_tokens": 3141433.0, "repeat_count": 0.0, "routers_loss": 0.051109060645103455, "skip_count": 5.0, "step": 2150, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 12.512745812090314, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11083984375, "learning_rate": 0.0009009457619528876, "loss": 0.0093, "macro_f1": 0.6666666865348816, "num_tokens": 3144098.0, "repeat_count": 0.0, "routers_loss": 0.009349791333079338, "skip_count": 4.0, "step": 2152, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 12.524399126001457, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2373046875, "learning_rate": 0.0009007139121802204, "loss": 0.0099, "macro_f1": 0.6666666865348816, "num_tokens": 3146735.0, "repeat_count": 0.0, "routers_loss": 0.004418746568262577, "skip_count": 1.0, "step": 2154, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.5360524399126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.435546875, "learning_rate": 0.000900481821292626, "loss": 0.0104, "macro_f1": 0.3333333432674408, "num_tokens": 3149466.0, "repeat_count": 0.0, "routers_loss": 0.003090545767918229, "skip_count": 0.0, "step": 2156, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 12.547705753823744, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.2236328125, "learning_rate": 0.0009002494894297566, "loss": 0.0071, "macro_f1": 1.0, "num_tokens": 3152548.0, "repeat_count": 1.0, "routers_loss": 0.008128263987600803, "skip_count": 2.0, "step": 2158, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 12.559359067734887, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.12109375, "learning_rate": 0.0009000169167314095, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 3155587.0, "repeat_count": 0.0, "routers_loss": 0.006265183910727501, "skip_count": 3.0, "step": 2160, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.75, "avg_layers": 34.0, "epoch": 12.57101238164603, "f1_execute": 0.9677419066429138, "f1_repeat": 0.6666666865348816, "f1_skip": 0.8571428656578064, "grad_norm": 0.2236328125, "learning_rate": 0.0008997841033375267, "loss": 0.009, "macro_f1": 0.8305171728134155, "num_tokens": 3158625.0, "repeat_count": 2.0, "routers_loss": 0.02151372842490673, "skip_count": 4.0, "step": 2162, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.582665695557175, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1015625, "learning_rate": 0.0008995510493881951, "loss": 0.0094, "macro_f1": 0.3333333432674408, "num_tokens": 3161182.0, "repeat_count": 0.0, "routers_loss": 0.0023458958603441715, "skip_count": 0.0, "step": 2164, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.594319009468318, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2119140625, "learning_rate": 0.0008993177550236464, "loss": 0.0084, "macro_f1": 0.32863849401474, "num_tokens": 3163933.0, "repeat_count": 0.0, "routers_loss": 0.007210600655525923, "skip_count": 1.0, "step": 2166, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 12.605972323379461, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2109375, "learning_rate": 0.000899084220384257, "loss": 0.0099, "macro_f1": 0.6666666865348816, "num_tokens": 3167940.0, "repeat_count": 0.0, "routers_loss": 0.0054290457628667355, "skip_count": 1.0, "step": 2168, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.617625637290605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2216796875, "learning_rate": 0.000898850445610548, "loss": 0.0102, "macro_f1": 0.3333333432674408, "num_tokens": 3170677.0, "repeat_count": 0.0, "routers_loss": 0.0054080430418252945, "skip_count": 0.0, "step": 2170, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.8333333134651184, "avg_layers": 31.0, "epoch": 12.629278951201748, "f1_execute": 0.9836065173149109, "f1_repeat": 0.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.2236328125, "learning_rate": 0.0008986164308431845, "loss": 0.0109, "macro_f1": 0.63089919090271, "num_tokens": 3173252.0, "repeat_count": 0.0, "routers_loss": 0.023559805005788803, "skip_count": 6.0, "step": 2172, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.640932265112891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.0008983821762229765, "loss": 0.0099, "macro_f1": 0.3333333432674408, "num_tokens": 3175873.0, "repeat_count": 0.0, "routers_loss": 0.001991610275581479, "skip_count": 0.0, "step": 2174, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.652585579024034, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.302734375, "learning_rate": 0.0008981476818908778, "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 3179349.0, "repeat_count": 0.0, "routers_loss": 0.0010556118795648217, "skip_count": 0.0, "step": 2176, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 12.66423889293518, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.36328125, "learning_rate": 0.0008979129479879873, "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 3182136.0, "repeat_count": 0.0, "routers_loss": 0.0008595810504630208, "skip_count": 2.0, "step": 2178, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.675892206846322, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2353515625, "learning_rate": 0.0008976779746555473, "loss": 0.0104, "macro_f1": 0.3333333432674408, "num_tokens": 3184716.0, "repeat_count": 0.0, "routers_loss": 0.001379984663799405, "skip_count": 0.0, "step": 2180, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.687545520757466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.4921875, "learning_rate": 0.0008974427620349444, "loss": 0.014, "macro_f1": 0.3333333432674408, "num_tokens": 3187522.0, "repeat_count": 0.0, "routers_loss": 0.004347538575530052, "skip_count": 0.0, "step": 2182, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.699198834668609, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1962890625, "learning_rate": 0.0008972073102677091, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 3190532.0, "repeat_count": 0.0, "routers_loss": 0.0010899569606408477, "skip_count": 0.0, "step": 2184, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.710852148579752, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12255859375, "learning_rate": 0.000896971619495516, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 3193819.0, "repeat_count": 0.0, "routers_loss": 0.0007734254468232393, "skip_count": 0.0, "step": 2186, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 12.722505462490895, "f1_execute": 0.9705882668495178, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.73046875, "learning_rate": 0.0008967356898601833, "loss": 0.0114, "macro_f1": 0.656862735748291, "num_tokens": 3195959.0, "repeat_count": 1.0, "routers_loss": 0.04526396468281746, "skip_count": 2.0, "step": 2188, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 12.734158776402039, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.4453125, "learning_rate": 0.0008964995215036731, "loss": 0.0161, "macro_f1": 1.0, "num_tokens": 3198683.0, "repeat_count": 1.0, "routers_loss": 0.006627350114285946, "skip_count": 4.0, "step": 2190, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 12.745812090313184, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.36328125, "learning_rate": 0.000896263114568091, "loss": 0.0155, "macro_f1": 1.0, "num_tokens": 3201300.0, "repeat_count": 1.0, "routers_loss": 0.00941085908561945, "skip_count": 1.0, "step": 2192, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.757465404224327, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.318359375, "learning_rate": 0.0008960264691956863, "loss": 0.0141, "macro_f1": 0.3333333432674408, "num_tokens": 3204202.0, "repeat_count": 0.0, "routers_loss": 0.0015670397551730275, "skip_count": 0.0, "step": 2194, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 33.0, "epoch": 12.76911871813547, "f1_execute": 0.9696969985961914, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.33984375, "learning_rate": 0.0008957895855288517, "loss": 0.0152, "macro_f1": 0.5454546213150024, "num_tokens": 3207231.0, "repeat_count": 0.0, "routers_loss": 0.02945515513420105, "skip_count": 3.0, "step": 2196, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 12.780772032046613, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.2236328125, "learning_rate": 0.0008955524637101232, "loss": 0.0088, "macro_f1": 1.0, "num_tokens": 3210185.0, "repeat_count": 1.0, "routers_loss": 0.016947409138083458, "skip_count": 4.0, "step": 2198, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 34.0, "epoch": 12.792425345957756, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.11767578125, "learning_rate": 0.0008953151038821801, "loss": 0.0083, "macro_f1": 0.9470900297164917, "num_tokens": 3212665.0, "repeat_count": 1.0, "routers_loss": 0.014997169375419617, "skip_count": 4.0, "step": 2200, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.8040786598689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.353515625, "learning_rate": 0.0008950775061878452, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 3215856.0, "repeat_count": 0.0, "routers_loss": 0.006159586366266012, "skip_count": 0.0, "step": 2202, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.815731973780043, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.0008948396707700841, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 3219162.0, "repeat_count": 0.0, "routers_loss": 0.0015480044530704618, "skip_count": 0.0, "step": 2204, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 12.827385287691188, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.203125, "learning_rate": 0.0008946015977720055, "loss": 0.0123, "macro_f1": 0.6666666865348816, "num_tokens": 3222046.0, "repeat_count": 0.0, "routers_loss": 0.0055579375475645065, "skip_count": 3.0, "step": 2206, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 12.839038601602331, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0008943632873368611, "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 3224459.0, "repeat_count": 1.0, "routers_loss": 0.0074456059373915195, "skip_count": 0.0, "step": 2208, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.850691915513474, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.21875, "learning_rate": 0.0008941247396080456, "loss": 0.0105, "macro_f1": 0.3333333432674408, "num_tokens": 3227044.0, "repeat_count": 0.0, "routers_loss": 0.0006547431694343686, "skip_count": 0.0, "step": 2210, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.862345229424617, "f1_execute": 0.970588207244873, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.5859375, "learning_rate": 0.0008938859547290963, "loss": 0.0131, "macro_f1": 0.656862735748291, "num_tokens": 3229803.0, "repeat_count": 1.0, "routers_loss": 0.037330079823732376, "skip_count": 1.0, "step": 2212, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.87399854333576, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.314453125, "learning_rate": 0.0008936469328436932, "loss": 0.0079, "macro_f1": 0.32863849401474, "num_tokens": 3232280.0, "repeat_count": 0.0, "routers_loss": 0.01933332346379757, "skip_count": 1.0, "step": 2214, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 12.885651857246904, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.3359375, "learning_rate": 0.0008934076740956591, "loss": 0.0122, "macro_f1": 0.6666666865348816, "num_tokens": 3235072.0, "repeat_count": 1.0, "routers_loss": 0.003353309817612171, "skip_count": 0.0, "step": 2216, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.897305171158049, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.212890625, "learning_rate": 0.0008931681786289591, "loss": 0.0115, "macro_f1": 0.3333333432674408, "num_tokens": 3237664.0, "repeat_count": 0.0, "routers_loss": 0.005725269205868244, "skip_count": 0.0, "step": 2218, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 12.908958485069192, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.310546875, "learning_rate": 0.000892928446587701, "loss": 0.0138, "macro_f1": 0.6666666865348816, "num_tokens": 3240465.0, "repeat_count": 0.0, "routers_loss": 0.009353657253086567, "skip_count": 2.0, "step": 2220, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 12.920611798980335, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.13671875, "learning_rate": 0.0008926884781161346, "loss": 0.0073, "macro_f1": 0.5950249433517456, "num_tokens": 3243617.0, "repeat_count": 0.0, "routers_loss": 0.016089925542473793, "skip_count": 3.0, "step": 2222, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.932265112891479, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.255859375, "learning_rate": 0.000892448273358652, "loss": 0.0093, "macro_f1": 0.3333333432674408, "num_tokens": 3247033.0, "repeat_count": 0.0, "routers_loss": 0.007480847183614969, "skip_count": 0.0, "step": 2224, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.8333333134651184, "avg_layers": 30.0, "epoch": 12.943918426802622, "f1_execute": 0.9666666388511658, "f1_repeat": 0.0, "f1_skip": 0.8333333134651184, "grad_norm": 0.30078125, "learning_rate": 0.0008922078324597878, "loss": 0.011, "macro_f1": 0.6000000238418579, "num_tokens": 3250077.0, "repeat_count": 0.0, "routers_loss": 0.03119707852602005, "skip_count": 6.0, "step": 2226, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 12.955571740713765, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.234375, "learning_rate": 0.0008919671555642188, "loss": 0.011, "macro_f1": 1.0, "num_tokens": 3253150.0, "repeat_count": 2.0, "routers_loss": 0.004075814038515091, "skip_count": 7.0, "step": 2228, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 12.967225054624908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1884765625, "learning_rate": 0.0008917262428167629, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 3256448.0, "repeat_count": 0.0, "routers_loss": 0.005595121067017317, "skip_count": 0.0, "step": 2230, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 12.978878368536053, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.142578125, "learning_rate": 0.0008914850943623809, "loss": 0.0069, "macro_f1": 0.5507246255874634, "num_tokens": 3259686.0, "repeat_count": 0.0, "routers_loss": 0.013897078111767769, "skip_count": 2.0, "step": 2232, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 12.990531682447196, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.3046875, "learning_rate": 0.0008912437103461751, "loss": 0.0151, "macro_f1": 0.8839138746261597, "num_tokens": 3263319.0, "repeat_count": 1.0, "routers_loss": 0.017135098576545715, "skip_count": 2.0, "step": 2234, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.31640625, "learning_rate": 0.0008910020909133893, "loss": 0.0123, "macro_f1": 0.3333333432674408, "num_tokens": 3265808.0, "repeat_count": 0.0, "routers_loss": 0.000663192302454263, "skip_count": 0.0, "step": 2236, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.011653313911143, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11376953125, "learning_rate": 0.0008907602362094093, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 3268338.0, "repeat_count": 0.0, "routers_loss": 0.005594483111053705, "skip_count": 0.0, "step": 2238, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.023306627822286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10693359375, "learning_rate": 0.0008905181463797623, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 3271093.0, "repeat_count": 0.0, "routers_loss": 0.001263809739612043, "skip_count": 0.0, "step": 2240, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 13.03495994173343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.3515625, "learning_rate": 0.000890275821570117, "loss": 0.009, "macro_f1": 0.6666666865348816, "num_tokens": 3273718.0, "repeat_count": 0.0, "routers_loss": 0.002172135515138507, "skip_count": 2.0, "step": 2242, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.046613255644575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.169921875, "learning_rate": 0.0008900332619262833, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 3276376.0, "repeat_count": 0.0, "routers_loss": 0.00584355229511857, "skip_count": 0.0, "step": 2244, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.058266569555718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11328125, "learning_rate": 0.0008897904675942128, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 3280121.0, "repeat_count": 0.0, "routers_loss": 0.0080410772934556, "skip_count": 0.0, "step": 2246, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.069919883466861, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.240234375, "learning_rate": 0.000889547438719998, "loss": 0.0125, "macro_f1": 0.3333333432674408, "num_tokens": 3282505.0, "repeat_count": 0.0, "routers_loss": 0.0010983464308083057, "skip_count": 0.0, "step": 2248, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 13.081573197378004, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.4609375, "learning_rate": 0.0008893041754498726, "loss": 0.0107, "macro_f1": 1.0, "num_tokens": 3285860.0, "repeat_count": 1.0, "routers_loss": 0.00610734149813652, "skip_count": 1.0, "step": 2250, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 13.093226511289147, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.140625, "learning_rate": 0.0008890606779302115, "loss": 0.0072, "macro_f1": 1.0, "num_tokens": 3288800.0, "repeat_count": 1.0, "routers_loss": 0.020325055345892906, "skip_count": 4.0, "step": 2252, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 35.0, "epoch": 13.10487982520029, "f1_execute": 0.9846153855323792, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.1572265625, "learning_rate": 0.0008888169463075302, "loss": 0.0075, "macro_f1": 0.928205132484436, "num_tokens": 3291984.0, "repeat_count": 1.0, "routers_loss": 0.015041694045066833, "skip_count": 3.0, "step": 2254, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.116533139111436, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2216796875, "learning_rate": 0.0008885729807284854, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 3294863.0, "repeat_count": 0.0, "routers_loss": 0.004192774184048176, "skip_count": 0.0, "step": 2256, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 13.128186453022579, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.16796875, "learning_rate": 0.0008883287813398746, "loss": 0.0068, "macro_f1": 0.5950249433517456, "num_tokens": 3297739.0, "repeat_count": 0.0, "routers_loss": 0.008380064740777016, "skip_count": 2.0, "step": 2258, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 36.0, "epoch": 13.139839766933722, "f1_execute": 0.9696969985961914, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 0.439453125, "learning_rate": 0.0008880843482886357, "loss": 0.0119, "macro_f1": 0.8232323527336121, "num_tokens": 3301023.0, "repeat_count": 1.0, "routers_loss": 0.032572709023952484, "skip_count": 3.0, "step": 2260, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.151493080844865, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1376953125, "learning_rate": 0.0008878396817218472, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 3303984.0, "repeat_count": 0.0, "routers_loss": 0.003394267288967967, "skip_count": 0.0, "step": 2262, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 13.163146394756009, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.29296875, "learning_rate": 0.0008875947817867287, "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 3306998.0, "repeat_count": 0.0, "routers_loss": 0.0025778370909392834, "skip_count": 2.0, "step": 2264, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 13.174799708667152, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.25390625, "learning_rate": 0.0008873496486306393, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 3310460.0, "repeat_count": 0.0, "routers_loss": 0.006037441547960043, "skip_count": 1.0, "step": 2266, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.186453022578295, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.000887104282401079, "loss": 0.01, "macro_f1": 0.32863849401474, "num_tokens": 3313418.0, "repeat_count": 0.0, "routers_loss": 0.011105027981102467, "skip_count": 1.0, "step": 2268, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.19810633648944, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1826171875, "learning_rate": 0.0008868586832456881, "loss": 0.0089, "macro_f1": 0.32863849401474, "num_tokens": 3316248.0, "repeat_count": 0.0, "routers_loss": 0.010674779303371906, "skip_count": 1.0, "step": 2270, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 13.209759650400583, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.255859375, "learning_rate": 0.0008866128513122469, "loss": 0.0093, "macro_f1": 1.0, "num_tokens": 3319047.0, "repeat_count": 1.0, "routers_loss": 0.002916676225140691, "skip_count": 1.0, "step": 2272, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 13.221412964311726, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.298828125, "learning_rate": 0.0008863667867486755, "loss": 0.0103, "macro_f1": 1.0, "num_tokens": 3321617.0, "repeat_count": 1.0, "routers_loss": 0.0037084603682160378, "skip_count": 1.0, "step": 2274, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 33.0, "epoch": 13.23306627822287, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.33203125, "learning_rate": 0.0008861204897030346, "loss": 0.0141, "macro_f1": 0.6139194369316101, "num_tokens": 3324994.0, "repeat_count": 0.0, "routers_loss": 0.009762105531990528, "skip_count": 4.0, "step": 2276, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.244719592134013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.24609375, "learning_rate": 0.0008858739603235242, "loss": 0.0116, "macro_f1": 0.3333333432674408, "num_tokens": 3327573.0, "repeat_count": 0.0, "routers_loss": 0.0056715779937803745, "skip_count": 0.0, "step": 2278, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 13.256372906045156, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.228515625, "learning_rate": 0.0008856271987584843, "loss": 0.0088, "macro_f1": 0.6666666865348816, "num_tokens": 3330387.0, "repeat_count": 0.0, "routers_loss": 0.00684339227154851, "skip_count": 3.0, "step": 2280, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 13.2680262199563, "f1_execute": 0.9830508232116699, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.1357421875, "learning_rate": 0.0008853802051563949, "loss": 0.0071, "macro_f1": 0.9276835918426514, "num_tokens": 3333518.0, "repeat_count": 3.0, "routers_loss": 0.014720491133630276, "skip_count": 4.0, "step": 2282, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.279679533867444, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.134765625, "learning_rate": 0.0008851329796658752, "loss": 0.0092, "macro_f1": 0.32863849401474, "num_tokens": 3337397.0, "repeat_count": 0.0, "routers_loss": 0.02213127352297306, "skip_count": 1.0, "step": 2284, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.75, "avg_layers": 34.0, "epoch": 13.291332847778587, "f1_execute": 0.9677419066429138, "f1_repeat": 0.6666666865348816, "f1_skip": 0.8571428656578064, "grad_norm": 0.322265625, "learning_rate": 0.0008848855224356839, "loss": 0.0128, "macro_f1": 0.8305171728134155, "num_tokens": 3340595.0, "repeat_count": 2.0, "routers_loss": 0.054318949580192566, "skip_count": 4.0, "step": 2286, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.30298616168973, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1259765625, "learning_rate": 0.0008846378336147196, "loss": 0.011, "macro_f1": 0.3333333432674408, "num_tokens": 3343514.0, "repeat_count": 0.0, "routers_loss": 0.0008490721811540425, "skip_count": 0.0, "step": 2288, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 13.314639475600874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.3125, "learning_rate": 0.00088438991335202, "loss": 0.0108, "macro_f1": 0.6666666865348816, "num_tokens": 3346504.0, "repeat_count": 0.0, "routers_loss": 0.003796581644564867, "skip_count": 1.0, "step": 2290, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 13.326292789512017, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1767578125, "learning_rate": 0.0008841417617967617, "loss": 0.0079, "macro_f1": 0.6616915464401245, "num_tokens": 3349113.0, "repeat_count": 2.0, "routers_loss": 0.011256271041929722, "skip_count": 1.0, "step": 2292, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 13.33794610342316, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1435546875, "learning_rate": 0.0008838933790982612, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 3351969.0, "repeat_count": 0.0, "routers_loss": 0.005762037355452776, "skip_count": 2.0, "step": 2294, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 13.349599417334304, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1904296875, "learning_rate": 0.0008836447654059734, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 3354494.0, "repeat_count": 0.0, "routers_loss": 0.003967686556279659, "skip_count": 2.0, "step": 2296, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 13.361252731245449, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.123046875, "learning_rate": 0.0008833959208694928, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 3358058.0, "repeat_count": 0.0, "routers_loss": 0.0040358551777899265, "skip_count": 1.0, "step": 2298, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.372906045156592, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.298828125, "learning_rate": 0.0008831468456385523, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 3360443.0, "repeat_count": 0.0, "routers_loss": 0.0037742112763226032, "skip_count": 0.0, "step": 2300, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.384559359067735, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.138671875, "learning_rate": 0.0008828975398630236, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 3363587.0, "repeat_count": 0.0, "routers_loss": 0.0006881384178996086, "skip_count": 0.0, "step": 2302, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 13.396212672978878, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08203125, "learning_rate": 0.0008826480036929179, "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 3366514.0, "repeat_count": 0.0, "routers_loss": 0.009305343963205814, "skip_count": 3.0, "step": 2304, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.407865986890021, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2373046875, "learning_rate": 0.0008823982372783837, "loss": 0.0092, "macro_f1": 0.3188405930995941, "num_tokens": 3369371.0, "repeat_count": 1.0, "routers_loss": 0.04386809095740318, "skip_count": 2.0, "step": 2306, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 13.419519300801165, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07958984375, "learning_rate": 0.0008821482407697094, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 3372079.0, "repeat_count": 0.0, "routers_loss": 0.005276011768728495, "skip_count": 2.0, "step": 2308, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 13.43117261471231, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1396484375, "learning_rate": 0.0008818980143173212, "loss": 0.0093, "macro_f1": 1.0, "num_tokens": 3376012.0, "repeat_count": 1.0, "routers_loss": 0.0015877397963777184, "skip_count": 2.0, "step": 2310, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 13.442825928623453, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.365234375, "learning_rate": 0.0008816475580717833, "loss": 0.0125, "macro_f1": 0.4901960790157318, "num_tokens": 3378447.0, "repeat_count": 0.0, "routers_loss": 0.04406385496258736, "skip_count": 3.0, "step": 2312, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.454479242534596, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1494140625, "learning_rate": 0.0008813968721837988, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 3381304.0, "repeat_count": 0.0, "routers_loss": 0.0013554419856518507, "skip_count": 0.0, "step": 2314, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 13.46613255644574, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1806640625, "learning_rate": 0.0008811459568042091, "loss": 0.0094, "macro_f1": 0.5507246255874634, "num_tokens": 3384109.0, "repeat_count": 0.0, "routers_loss": 0.02240442857146263, "skip_count": 2.0, "step": 2316, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.477785870356882, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1123046875, "learning_rate": 0.0008808948120839927, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 3387213.0, "repeat_count": 0.0, "routers_loss": 0.004534904845058918, "skip_count": 0.0, "step": 2318, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 13.489439184268026, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.166015625, "learning_rate": 0.0008806434381742671, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 3390219.0, "repeat_count": 0.0, "routers_loss": 0.0045533753000199795, "skip_count": 1.0, "step": 2320, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.501092498179169, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.20703125, "learning_rate": 0.0008803918352262874, "loss": 0.0092, "macro_f1": 0.32863849401474, "num_tokens": 3392952.0, "repeat_count": 1.0, "routers_loss": 0.027193710207939148, "skip_count": 0.0, "step": 2322, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.512745812090314, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.20703125, "learning_rate": 0.0008801400033914464, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 3396856.0, "repeat_count": 0.0, "routers_loss": 0.0009185796952806413, "skip_count": 0.0, "step": 2324, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 13.524399126001457, "f1_execute": 0.9846153855323792, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.158203125, "learning_rate": 0.0008798879428212747, "loss": 0.0085, "macro_f1": 0.8837606906890869, "num_tokens": 3399239.0, "repeat_count": 2.0, "routers_loss": 0.03137107193470001, "skip_count": 2.0, "step": 2326, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.5360524399126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.0008796356536674404, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 3402108.0, "repeat_count": 0.0, "routers_loss": 0.0016483605140820146, "skip_count": 0.0, "step": 2328, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.547705753823744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.087890625, "learning_rate": 0.0008793831360817495, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 3405018.0, "repeat_count": 0.0, "routers_loss": 0.0055032754316926, "skip_count": 0.0, "step": 2330, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 13.559359067734887, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.26953125, "learning_rate": 0.0008791303902161449, "loss": 0.007, "macro_f1": 0.661835789680481, "num_tokens": 3407801.0, "repeat_count": 1.0, "routers_loss": 0.008590149693191051, "skip_count": 1.0, "step": 2332, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.57101238164603, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.20703125, "learning_rate": 0.0008788774162227074, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 3410627.0, "repeat_count": 0.0, "routers_loss": 0.003803118597716093, "skip_count": 0.0, "step": 2334, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 13.582665695557175, "f1_execute": 0.9850746393203735, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.2158203125, "learning_rate": 0.0008786242142536547, "loss": 0.0078, "macro_f1": 0.8839138746261597, "num_tokens": 3412953.0, "repeat_count": 2.0, "routers_loss": 0.013405604287981987, "skip_count": 1.0, "step": 2336, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 13.594319009468318, "f1_execute": 0.9841269850730896, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.357421875, "learning_rate": 0.0008783707844613419, "loss": 0.0109, "macro_f1": 0.6613757014274597, "num_tokens": 3415870.0, "repeat_count": 1.0, "routers_loss": 0.031492214649915695, "skip_count": 4.0, "step": 2338, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.605972323379461, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11083984375, "learning_rate": 0.0008781171269982611, "loss": 0.0089, "macro_f1": 0.3333333432674408, "num_tokens": 3418816.0, "repeat_count": 0.0, "routers_loss": 0.001053396612405777, "skip_count": 0.0, "step": 2340, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 13.617625637290605, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.103515625, "learning_rate": 0.0008778632420170414, "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 3421878.0, "repeat_count": 1.0, "routers_loss": 0.002767848549410701, "skip_count": 2.0, "step": 2342, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.629278951201748, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.26171875, "learning_rate": 0.0008776091296704488, "loss": 0.0095, "macro_f1": 0.3333333432674408, "num_tokens": 3424600.0, "repeat_count": 0.0, "routers_loss": 0.0007681563147343695, "skip_count": 0.0, "step": 2344, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 13.640932265112891, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1396484375, "learning_rate": 0.000877354790111386, "loss": 0.0081, "macro_f1": 1.0, "num_tokens": 3427251.0, "repeat_count": 1.0, "routers_loss": 0.00981063861399889, "skip_count": 2.0, "step": 2346, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.652585579024034, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1591796875, "learning_rate": 0.0008771002234928926, "loss": 0.0086, "macro_f1": 0.32863849401474, "num_tokens": 3430552.0, "repeat_count": 1.0, "routers_loss": 0.027748996391892433, "skip_count": 0.0, "step": 2348, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 13.66423889293518, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.197265625, "learning_rate": 0.0008768454299681448, "loss": 0.0089, "macro_f1": 0.6666666865348816, "num_tokens": 3433439.0, "repeat_count": 0.0, "routers_loss": 0.01183625590056181, "skip_count": 4.0, "step": 2350, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.675892206846322, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.203125, "learning_rate": 0.0008765904096904553, "loss": 0.0081, "macro_f1": 0.32863849401474, "num_tokens": 3436634.0, "repeat_count": 1.0, "routers_loss": 0.031146476045250893, "skip_count": 0.0, "step": 2352, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 13.687545520757466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1875, "learning_rate": 0.000876335162813273, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 3439137.0, "repeat_count": 0.0, "routers_loss": 0.005278544966131449, "skip_count": 1.0, "step": 2354, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 13.699198834668609, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11669921875, "learning_rate": 0.0008760796894901835, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 3441839.0, "repeat_count": 0.0, "routers_loss": 0.0015443433076143265, "skip_count": 2.0, "step": 2356, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 13.710852148579752, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.185546875, "learning_rate": 0.0008758239898749086, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 3444979.0, "repeat_count": 1.0, "routers_loss": 0.0031127736438065767, "skip_count": 0.0, "step": 2358, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.722505462490895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.23046875, "learning_rate": 0.000875568064121306, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 3447927.0, "repeat_count": 0.0, "routers_loss": 0.003809203626587987, "skip_count": 0.0, "step": 2360, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 13.734158776402039, "f1_execute": 0.9677419066429138, "f1_repeat": 0.0, "f1_skip": 0.888888955116272, "grad_norm": 0.26953125, "learning_rate": 0.0008753119123833697, "loss": 0.0092, "macro_f1": 0.6188769340515137, "num_tokens": 3450230.0, "repeat_count": 1.0, "routers_loss": 0.02130594477057457, "skip_count": 4.0, "step": 2362, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 13.745812090313184, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.146484375, "learning_rate": 0.0008750555348152298, "loss": 0.0105, "macro_f1": 1.0, "num_tokens": 3453638.0, "repeat_count": 1.0, "routers_loss": 0.002106165513396263, "skip_count": 2.0, "step": 2364, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 13.757465404224327, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.16015625, "learning_rate": 0.000874798931571152, "loss": 0.01, "macro_f1": 0.6666666865348816, "num_tokens": 3456070.0, "repeat_count": 1.0, "routers_loss": 0.005320910830050707, "skip_count": 0.0, "step": 2366, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 13.76911871813547, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.224609375, "learning_rate": 0.0008745421028055378, "loss": 0.0087, "macro_f1": 1.0, "num_tokens": 3458857.0, "repeat_count": 1.0, "routers_loss": 0.00395248644053936, "skip_count": 1.0, "step": 2368, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 13.780772032046613, "f1_execute": 0.9836065173149109, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.341796875, "learning_rate": 0.0008742850486729247, "loss": 0.0106, "macro_f1": 0.9278688430786133, "num_tokens": 3461537.0, "repeat_count": 3.0, "routers_loss": 0.09415233135223389, "skip_count": 3.0, "step": 2370, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.792425345957756, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1630859375, "learning_rate": 0.0008740277693279854, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 3464509.0, "repeat_count": 0.0, "routers_loss": 0.0007302272133529186, "skip_count": 0.0, "step": 2372, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.8040786598689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.123046875, "learning_rate": 0.0008737702649255287, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 3468896.0, "repeat_count": 0.0, "routers_loss": 0.003523432882502675, "skip_count": 0.0, "step": 2374, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.815731973780043, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.263671875, "learning_rate": 0.000873512535620498, "loss": 0.0093, "macro_f1": 0.32863849401474, "num_tokens": 3471611.0, "repeat_count": 0.0, "routers_loss": 0.03664179891347885, "skip_count": 1.0, "step": 2376, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 13.827385287691188, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1318359375, "learning_rate": 0.0008732545815679728, "loss": 0.017, "macro_f1": 0.5507246255874634, "num_tokens": 3474310.0, "repeat_count": 0.0, "routers_loss": 0.016573546454310417, "skip_count": 2.0, "step": 2378, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.839038601602331, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1796875, "learning_rate": 0.0008729964029231673, "loss": 0.0114, "macro_f1": 0.3333333432674408, "num_tokens": 3476794.0, "repeat_count": 0.0, "routers_loss": 0.0009162999340333045, "skip_count": 0.0, "step": 2380, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.850691915513474, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.162109375, "learning_rate": 0.0008727379998414311, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 3480248.0, "repeat_count": 0.0, "routers_loss": 0.0056904456578195095, "skip_count": 0.0, "step": 2382, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.862345229424617, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.166015625, "learning_rate": 0.0008724793724782489, "loss": 0.0096, "macro_f1": 0.3333333432674408, "num_tokens": 3483539.0, "repeat_count": 0.0, "routers_loss": 0.0036526417825371027, "skip_count": 0.0, "step": 2384, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 13.87399854333576, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1708984375, "learning_rate": 0.0008722205209892402, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 3486343.0, "repeat_count": 0.0, "routers_loss": 0.005990063305944204, "skip_count": 3.0, "step": 2386, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 13.885651857246904, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.22265625, "learning_rate": 0.0008719614455301592, "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 3488779.0, "repeat_count": 0.0, "routers_loss": 0.009350229986011982, "skip_count": 2.0, "step": 2388, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 13.897305171158049, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.263671875, "learning_rate": 0.0008717021462568955, "loss": 0.0113, "macro_f1": 0.6666666865348816, "num_tokens": 3491536.0, "repeat_count": 0.0, "routers_loss": 0.0025889824610203505, "skip_count": 2.0, "step": 2390, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.908958485069192, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.48828125, "learning_rate": 0.0008714426233254726, "loss": 0.0101, "macro_f1": 0.3333333432674408, "num_tokens": 3494022.0, "repeat_count": 0.0, "routers_loss": 0.006271359510719776, "skip_count": 0.0, "step": 2392, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 13.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.169921875, "learning_rate": 0.0008711828768920488, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 3496873.0, "repeat_count": 0.0, "routers_loss": 0.0062958430498838425, "skip_count": 1.0, "step": 2394, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 13.932265112891479, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.20703125, "learning_rate": 0.0008709229071129177, "loss": 0.0098, "macro_f1": 1.0, "num_tokens": 3499686.0, "repeat_count": 1.0, "routers_loss": 0.006420778576284647, "skip_count": 2.0, "step": 2396, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.943918426802622, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1630859375, "learning_rate": 0.0008706627141445058, "loss": 0.0102, "macro_f1": 0.3333333432674408, "num_tokens": 3502694.0, "repeat_count": 0.0, "routers_loss": 0.002980271354317665, "skip_count": 0.0, "step": 2398, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.955571740713765, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.123046875, "learning_rate": 0.000870402298143375, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 3505599.0, "repeat_count": 0.0, "routers_loss": 0.002412529196590185, "skip_count": 0.0, "step": 2400, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 13.967225054624908, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.23046875, "learning_rate": 0.0008701416592662212, "loss": 0.0095, "macro_f1": 1.0, "num_tokens": 3508368.0, "repeat_count": 1.0, "routers_loss": 0.007215274032205343, "skip_count": 1.0, "step": 2402, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 13.978878368536053, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.458984375, "learning_rate": 0.0008698807976698743, "loss": 0.0121, "macro_f1": 0.32863849401474, "num_tokens": 3511005.0, "repeat_count": 0.0, "routers_loss": 0.04310129955410957, "skip_count": 1.0, "step": 2404, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 13.990531682447196, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.279296875, "learning_rate": 0.000869619713511298, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 3514060.0, "repeat_count": 0.0, "routers_loss": 0.0016910625854507089, "skip_count": 1.0, "step": 2406, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 14.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.22265625, "learning_rate": 0.0008693584069475904, "loss": 0.0121, "macro_f1": 0.6666666865348816, "num_tokens": 3517024.0, "repeat_count": 0.0, "routers_loss": 0.00303131272085011, "skip_count": 1.0, "step": 2408, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 14.011653313911143, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.142578125, "learning_rate": 0.000869096878135983, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 3520959.0, "repeat_count": 0.0, "routers_loss": 0.004226956516504288, "skip_count": 1.0, "step": 2410, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.023306627822286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.16015625, "learning_rate": 0.0008688351272338411, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 3523695.0, "repeat_count": 0.0, "routers_loss": 0.0011959103867411613, "skip_count": 0.0, "step": 2412, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 37.0, "epoch": 14.03495994173343, "f1_execute": 0.9846153855323792, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.134765625, "learning_rate": 0.0008685731543986639, "loss": 0.0057, "macro_f1": 0.8837606906890869, "num_tokens": 3526558.0, "repeat_count": 2.0, "routers_loss": 0.01705942489206791, "skip_count": 2.0, "step": 2414, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 14.046613255644575, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1884765625, "learning_rate": 0.000868310959788084, "loss": 0.0079, "macro_f1": 0.5507246255874634, "num_tokens": 3529243.0, "repeat_count": 0.0, "routers_loss": 0.012217274866998196, "skip_count": 2.0, "step": 2416, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.058266569555718, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1455078125, "learning_rate": 0.0008680485435598673, "loss": 0.0064, "macro_f1": 0.32863849401474, "num_tokens": 3532987.0, "repeat_count": 1.0, "routers_loss": 0.013472831808030605, "skip_count": 0.0, "step": 2418, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 14.069919883466861, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2080078125, "learning_rate": 0.000867785905871913, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 3535608.0, "repeat_count": 0.0, "routers_loss": 0.004113011062145233, "skip_count": 1.0, "step": 2420, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 14.081573197378004, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2451171875, "learning_rate": 0.0008675230468822539, "loss": 0.0098, "macro_f1": 0.6666666865348816, "num_tokens": 3538022.0, "repeat_count": 0.0, "routers_loss": 0.009245575405657291, "skip_count": 5.0, "step": 2422, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.093226511289147, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2119140625, "learning_rate": 0.0008672599667490559, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 3540840.0, "repeat_count": 0.0, "routers_loss": 0.0013478371547535062, "skip_count": 0.0, "step": 2424, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.10487982520029, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1953125, "learning_rate": 0.0008669966656306177, "loss": 0.0094, "macro_f1": 0.3333333432674408, "num_tokens": 3543744.0, "repeat_count": 0.0, "routers_loss": 0.002089953050017357, "skip_count": 0.0, "step": 2426, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.116533139111436, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.095703125, "learning_rate": 0.0008667331436853711, "loss": 0.0115, "macro_f1": 0.3333333432674408, "num_tokens": 3546681.0, "repeat_count": 0.0, "routers_loss": 0.0035835886374115944, "skip_count": 0.0, "step": 2428, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 14.128186453022579, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.18359375, "learning_rate": 0.0008664694010718808, "loss": 0.0104, "macro_f1": 1.0, "num_tokens": 3549363.0, "repeat_count": 1.0, "routers_loss": 0.007169296499341726, "skip_count": 2.0, "step": 2430, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 14.139839766933722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.12451171875, "learning_rate": 0.000866205437948844, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 3551571.0, "repeat_count": 0.0, "routers_loss": 0.0022611983586102724, "skip_count": 2.0, "step": 2432, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.151493080844865, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1044921875, "learning_rate": 0.0008659412544750913, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 3554655.0, "repeat_count": 0.0, "routers_loss": 0.0040114703588187695, "skip_count": 0.0, "step": 2434, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 14.163146394756009, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.126953125, "learning_rate": 0.0008656768508095852, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 3557851.0, "repeat_count": 1.0, "routers_loss": 0.0024062488228082657, "skip_count": 0.0, "step": 2436, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 14.174799708667152, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1767578125, "learning_rate": 0.0008654122271114211, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 3560696.0, "repeat_count": 1.0, "routers_loss": 0.0007947225240059197, "skip_count": 0.0, "step": 2438, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 14.186453022578295, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1435546875, "learning_rate": 0.0008651473835398262, "loss": 0.0084, "macro_f1": 0.5507246255874634, "num_tokens": 3563575.0, "repeat_count": 0.0, "routers_loss": 0.007512022275477648, "skip_count": 2.0, "step": 2440, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 14.19810633648944, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2158203125, "learning_rate": 0.000864882320254161, "loss": 0.0073, "macro_f1": 0.5507246255874634, "num_tokens": 3566450.0, "repeat_count": 0.0, "routers_loss": 0.01589658483862877, "skip_count": 2.0, "step": 2442, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.209759650400583, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.251953125, "learning_rate": 0.0008646170374139172, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 3570311.0, "repeat_count": 0.0, "routers_loss": 0.0021179458126425743, "skip_count": 0.0, "step": 2444, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.221412964311726, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09375, "learning_rate": 0.0008643515351787192, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 3572809.0, "repeat_count": 0.0, "routers_loss": 0.0032154603395611048, "skip_count": 0.0, "step": 2446, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 14.23306627822287, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1591796875, "learning_rate": 0.0008640858137083232, "loss": 0.0071, "macro_f1": 1.0, "num_tokens": 3575403.0, "repeat_count": 1.0, "routers_loss": 0.004684223793447018, "skip_count": 3.0, "step": 2448, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 14.244719592134013, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0869140625, "learning_rate": 0.0008638198731626173, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 3579697.0, "repeat_count": 0.0, "routers_loss": 0.005157069303095341, "skip_count": 2.0, "step": 2450, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 14.256372906045156, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.28125, "learning_rate": 0.0008635537137016218, "loss": 0.0085, "macro_f1": 0.6616915464401245, "num_tokens": 3582713.0, "repeat_count": 1.0, "routers_loss": 0.01533245388418436, "skip_count": 2.0, "step": 2452, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 14.2680262199563, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1416015625, "learning_rate": 0.000863287335485488, "loss": 0.0076, "macro_f1": 1.0, "num_tokens": 3585266.0, "repeat_count": 1.0, "routers_loss": 0.0019299100385978818, "skip_count": 4.0, "step": 2454, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 14.279679533867444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1513671875, "learning_rate": 0.0008630207386744994, "loss": 0.0098, "macro_f1": 0.6666666865348816, "num_tokens": 3588267.0, "repeat_count": 0.0, "routers_loss": 0.01078882534056902, "skip_count": 2.0, "step": 2456, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.291332847778587, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1796875, "learning_rate": 0.000862753923429071, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 3591180.0, "repeat_count": 0.0, "routers_loss": 0.0019160788506269455, "skip_count": 0.0, "step": 2458, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 14.30298616168973, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.2197265625, "learning_rate": 0.0008624868899097491, "loss": 0.0081, "macro_f1": 1.0, "num_tokens": 3593629.0, "repeat_count": 2.0, "routers_loss": 0.004368767142295837, "skip_count": 4.0, "step": 2460, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.314639475600874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.107421875, "learning_rate": 0.000862219638277211, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 3597418.0, "repeat_count": 0.0, "routers_loss": 0.002857551909983158, "skip_count": 0.0, "step": 2462, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 14.326292789512017, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.2314453125, "learning_rate": 0.000861952168692266, "loss": 0.0072, "macro_f1": 1.0, "num_tokens": 3600616.0, "repeat_count": 1.0, "routers_loss": 0.00803455337882042, "skip_count": 3.0, "step": 2464, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 14.33794610342316, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.14453125, "learning_rate": 0.000861684481315854, "loss": 0.008, "macro_f1": 1.0, "num_tokens": 3603163.0, "repeat_count": 1.0, "routers_loss": 0.00889555737376213, "skip_count": 5.0, "step": 2466, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 14.349599417334304, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1298828125, "learning_rate": 0.000861416576309046, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 3606101.0, "repeat_count": 0.0, "routers_loss": 0.003707993309944868, "skip_count": 2.0, "step": 2468, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 14.361252731245449, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1962890625, "learning_rate": 0.0008611484538330441, "loss": 0.0097, "macro_f1": 0.661835789680481, "num_tokens": 3609559.0, "repeat_count": 1.0, "routers_loss": 0.038305092602968216, "skip_count": 1.0, "step": 2470, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 37.0, "epoch": 14.372906045156592, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1259765625, "learning_rate": 0.0008608801140491811, "loss": 0.0077, "macro_f1": 1.0, "num_tokens": 3612263.0, "repeat_count": 3.0, "routers_loss": 0.014152422547340393, "skip_count": 2.0, "step": 2472, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.384559359067735, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2734375, "learning_rate": 0.0008606115571189208, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 3614829.0, "repeat_count": 0.0, "routers_loss": 0.0014565002638846636, "skip_count": 0.0, "step": 2474, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.396212672978878, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1640625, "learning_rate": 0.0008603427832038573, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 3618008.0, "repeat_count": 0.0, "routers_loss": 0.004998714663088322, "skip_count": 0.0, "step": 2476, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.407865986890021, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.201171875, "learning_rate": 0.0008600737924657156, "loss": 0.012, "macro_f1": 0.32863849401474, "num_tokens": 3620903.0, "repeat_count": 1.0, "routers_loss": 0.0217443834990263, "skip_count": 0.0, "step": 2478, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 14.419519300801165, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.24609375, "learning_rate": 0.0008598045850663511, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 3623301.0, "repeat_count": 0.0, "routers_loss": 0.003345140488818288, "skip_count": 1.0, "step": 2480, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 14.43117261471231, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1826171875, "learning_rate": 0.0008595351611677492, "loss": 0.008, "macro_f1": 0.8839138746261597, "num_tokens": 3626064.0, "repeat_count": 1.0, "routers_loss": 0.017985092476010323, "skip_count": 2.0, "step": 2482, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 14.442825928623453, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.24609375, "learning_rate": 0.0008592655209320261, "loss": 0.0076, "macro_f1": 1.0, "num_tokens": 3629004.0, "repeat_count": 2.0, "routers_loss": 0.005585973616689444, "skip_count": 3.0, "step": 2484, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 14.454479242534596, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.2236328125, "learning_rate": 0.0008589956645214281, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 3631549.0, "repeat_count": 1.0, "routers_loss": 0.0018175052246078849, "skip_count": 0.0, "step": 2486, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 14.46613255644574, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1591796875, "learning_rate": 0.0008587255920983312, "loss": 0.0109, "macro_f1": 0.32863849401474, "num_tokens": 3635078.0, "repeat_count": 0.0, "routers_loss": 0.00974923837929964, "skip_count": 0.0, "step": 2488, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.477785870356882, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2001953125, "learning_rate": 0.0008584553038252414, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 3637900.0, "repeat_count": 0.0, "routers_loss": 0.01146594900637865, "skip_count": 0.0, "step": 2490, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 14.489439184268026, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.255859375, "learning_rate": 0.0008581847998647952, "loss": 0.0086, "macro_f1": 0.6666666865348816, "num_tokens": 3641137.0, "repeat_count": 0.0, "routers_loss": 0.00748372171074152, "skip_count": 4.0, "step": 2492, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 14.501092498179169, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.13671875, "learning_rate": 0.0008579140803797581, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 3644205.0, "repeat_count": 1.0, "routers_loss": 0.0006479470175690949, "skip_count": 0.0, "step": 2494, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 14.512745812090314, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.287109375, "learning_rate": 0.0008576431455330258, "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 3647138.0, "repeat_count": 1.0, "routers_loss": 0.0006546055665239692, "skip_count": 0.0, "step": 2496, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.524399126001457, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2294921875, "learning_rate": 0.0008573719954876233, "loss": 0.0087, "macro_f1": 0.32863849401474, "num_tokens": 3650039.0, "repeat_count": 0.0, "routers_loss": 0.010899928398430347, "skip_count": 1.0, "step": 2498, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 14.5360524399126, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.3359375, "learning_rate": 0.0008571006304067055, "loss": 0.0115, "macro_f1": 0.5507246255874634, "num_tokens": 3652649.0, "repeat_count": 0.0, "routers_loss": 0.007886072620749474, "skip_count": 2.0, "step": 2500, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.547705753823744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1357421875, "learning_rate": 0.0008568290504535563, "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 3655348.0, "repeat_count": 0.0, "routers_loss": 0.0015899145510047674, "skip_count": 0.0, "step": 2502, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.559359067734887, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1806640625, "learning_rate": 0.0008565572557915888, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 3658111.0, "repeat_count": 0.0, "routers_loss": 0.0030766960699111223, "skip_count": 0.0, "step": 2504, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 14.57101238164603, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11669921875, "learning_rate": 0.0008562852465843456, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 3661255.0, "repeat_count": 0.0, "routers_loss": 0.005691307131201029, "skip_count": 2.0, "step": 2506, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 14.582665695557175, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.11181640625, "learning_rate": 0.0008560130229954983, "loss": 0.0057, "macro_f1": 1.0, "num_tokens": 3663979.0, "repeat_count": 1.0, "routers_loss": 0.005326156970113516, "skip_count": 1.0, "step": 2508, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 14.594319009468318, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.16796875, "learning_rate": 0.0008557405851888475, "loss": 0.0088, "macro_f1": 0.6666666865348816, "num_tokens": 3666707.0, "repeat_count": 0.0, "routers_loss": 0.007061026990413666, "skip_count": 3.0, "step": 2510, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 14.605972323379461, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.12255859375, "learning_rate": 0.0008554679333283227, "loss": 0.0147, "macro_f1": 0.6666666865348816, "num_tokens": 3669577.0, "repeat_count": 0.0, "routers_loss": 0.0057168155908584595, "skip_count": 2.0, "step": 2512, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.617625637290605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1806640625, "learning_rate": 0.0008551950675779818, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 3673129.0, "repeat_count": 0.0, "routers_loss": 0.0034288119059056044, "skip_count": 0.0, "step": 2514, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 14.629278951201748, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.298828125, "learning_rate": 0.0008549219881020125, "loss": 0.01, "macro_f1": 0.6666666865348816, "num_tokens": 3676817.0, "repeat_count": 0.0, "routers_loss": 0.0040129018016159534, "skip_count": 1.0, "step": 2516, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 14.640932265112891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2470703125, "learning_rate": 0.0008546486950647298, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 3679409.0, "repeat_count": 0.0, "routers_loss": 0.0014133101794868708, "skip_count": 2.0, "step": 2518, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 14.652585579024034, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.2392578125, "learning_rate": 0.0008543751886305779, "loss": 0.0078, "macro_f1": 1.0, "num_tokens": 3682356.0, "repeat_count": 1.0, "routers_loss": 0.003889523446559906, "skip_count": 4.0, "step": 2520, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.66423889293518, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1611328125, "learning_rate": 0.0008541014689641294, "loss": 0.0103, "macro_f1": 0.32863849401474, "num_tokens": 3685033.0, "repeat_count": 0.0, "routers_loss": 0.024124711751937866, "skip_count": 1.0, "step": 2522, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.675892206846322, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.146484375, "learning_rate": 0.000853827536230085, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 3688130.0, "repeat_count": 0.0, "routers_loss": 0.0012442971346899867, "skip_count": 0.0, "step": 2524, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.687545520757466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.359375, "learning_rate": 0.0008535533905932737, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 3690992.0, "repeat_count": 0.0, "routers_loss": 0.002609512535855174, "skip_count": 0.0, "step": 2526, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 14.699198834668609, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.232421875, "learning_rate": 0.0008532790322186527, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 3694967.0, "repeat_count": 0.0, "routers_loss": 0.007062798831611872, "skip_count": 2.0, "step": 2528, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 14.710852148579752, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1689453125, "learning_rate": 0.000853004461271307, "loss": 0.0123, "macro_f1": 0.545751690864563, "num_tokens": 3697824.0, "repeat_count": 1.0, "routers_loss": 0.021207040175795555, "skip_count": 2.0, "step": 2530, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.722505462490895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.16015625, "learning_rate": 0.0008527296779164497, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 3700693.0, "repeat_count": 0.0, "routers_loss": 0.002124079270288348, "skip_count": 0.0, "step": 2532, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.734158776402039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2119140625, "learning_rate": 0.0008524546823194216, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 3703289.0, "repeat_count": 0.0, "routers_loss": 0.001182463252916932, "skip_count": 0.0, "step": 2534, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 14.745812090313184, "f1_execute": 0.9841269850730896, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.11572265625, "learning_rate": 0.0008521794746456911, "loss": 0.0054, "macro_f1": 0.8835979700088501, "num_tokens": 3706353.0, "repeat_count": 2.0, "routers_loss": 0.03521158918738365, "skip_count": 3.0, "step": 2536, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.8333333134651184, "avg_layers": 33.0, "epoch": 14.757465404224327, "f1_execute": 0.9629629850387573, "f1_repeat": 1.0, "f1_skip": 0.8333333134651184, "grad_norm": 0.1494140625, "learning_rate": 0.0008519040550608545, "loss": 0.0073, "macro_f1": 0.9320987462997437, "num_tokens": 3709352.0, "repeat_count": 3.0, "routers_loss": 0.03660592436790466, "skip_count": 6.0, "step": 2538, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.76911871813547, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.19921875, "learning_rate": 0.0008516284237306354, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 3712560.0, "repeat_count": 0.0, "routers_loss": 0.006190510466694832, "skip_count": 0.0, "step": 2540, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 37.0, "epoch": 14.780772032046613, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.2197265625, "learning_rate": 0.0008513525808208849, "loss": 0.0117, "macro_f1": 1.0, "num_tokens": 3715212.0, "repeat_count": 2.0, "routers_loss": 0.004166349768638611, "skip_count": 1.0, "step": 2542, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 14.792425345957756, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.322265625, "learning_rate": 0.0008510765264975813, "loss": 0.0084, "macro_f1": 1.0, "num_tokens": 3718369.0, "repeat_count": 1.0, "routers_loss": 0.005671303253620863, "skip_count": 3.0, "step": 2544, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 14.8040786598689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.095703125, "learning_rate": 0.0008508002609268301, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 3721170.0, "repeat_count": 0.0, "routers_loss": 0.0027014475781470537, "skip_count": 2.0, "step": 2546, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 14.815731973780043, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.115234375, "learning_rate": 0.0008505237842748643, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 3723732.0, "repeat_count": 0.0, "routers_loss": 0.006105746142566204, "skip_count": 1.0, "step": 2548, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 14.827385287691188, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1796875, "learning_rate": 0.0008502470967080433, "loss": 0.0088, "macro_f1": 0.6666666865348816, "num_tokens": 3726581.0, "repeat_count": 0.0, "routers_loss": 0.006461447570472956, "skip_count": 3.0, "step": 2550, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 14.839038601602331, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.255859375, "learning_rate": 0.0008499701983928538, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 3729542.0, "repeat_count": 0.0, "routers_loss": 0.004744027741253376, "skip_count": 2.0, "step": 2552, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 14.850691915513474, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.1474609375, "learning_rate": 0.0008496930894959094, "loss": 0.0081, "macro_f1": 0.5950249433517456, "num_tokens": 3732321.0, "repeat_count": 0.0, "routers_loss": 0.009049718268215656, "skip_count": 3.0, "step": 2554, "text_loss": 0.0 }, { "acc_repeat": 0.8333333134651184, "acc_skip": 1.0, "avg_layers": 40.0, "epoch": 14.862345229424617, "f1_execute": 0.9830508232116699, "f1_repeat": 0.9090909361839294, "f1_skip": 1.0, "grad_norm": 0.1162109375, "learning_rate": 0.00084941577018395, "loss": 0.0074, "macro_f1": 0.9640473127365112, "num_tokens": 3735419.0, "repeat_count": 6.0, "routers_loss": 0.02212626487016678, "skip_count": 1.0, "step": 2556, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.87399854333576, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1416015625, "learning_rate": 0.0008491382406238425, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 3738866.0, "repeat_count": 0.0, "routers_loss": 0.0007654703804291785, "skip_count": 0.0, "step": 2558, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 14.885651857246904, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.265625, "learning_rate": 0.0008488605009825801, "loss": 0.0097, "macro_f1": 0.6666666865348816, "num_tokens": 3741506.0, "repeat_count": 0.0, "routers_loss": 0.007666799705475569, "skip_count": 3.0, "step": 2560, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.897305171158049, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10986328125, "learning_rate": 0.0008485825514272824, "loss": 0.008, "macro_f1": 0.3333333432674408, "num_tokens": 3744264.0, "repeat_count": 0.0, "routers_loss": 0.001033568405546248, "skip_count": 0.0, "step": 2562, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 14.908958485069192, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.14453125, "learning_rate": 0.0008483043921251954, "loss": 0.0066, "macro_f1": 0.6616915464401245, "num_tokens": 3746837.0, "repeat_count": 2.0, "routers_loss": 0.02219482697546482, "skip_count": 1.0, "step": 2564, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2353515625, "learning_rate": 0.0008480260232436911, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 3749999.0, "repeat_count": 0.0, "routers_loss": 0.001898279064334929, "skip_count": 0.0, "step": 2566, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6000000238418579, "avg_layers": 34.0, "epoch": 14.932265112891479, "f1_execute": 0.9677419066429138, "f1_repeat": 1.0, "f1_skip": 0.75, "grad_norm": 0.1669921875, "learning_rate": 0.0008477474449502682, "loss": 0.0097, "macro_f1": 0.9059140086174011, "num_tokens": 3752608.0, "repeat_count": 1.0, "routers_loss": 0.02287086844444275, "skip_count": 5.0, "step": 2568, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 14.943918426802622, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1513671875, "learning_rate": 0.0008474686574125508, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 3754902.0, "repeat_count": 0.0, "routers_loss": 0.004794036969542503, "skip_count": 1.0, "step": 2570, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 14.955571740713765, "f1_execute": 0.95652174949646, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.142578125, "learning_rate": 0.000847189660798289, "loss": 0.0064, "macro_f1": 0.3188405930995941, "num_tokens": 3757621.0, "repeat_count": 0.0, "routers_loss": 0.024910759180784225, "skip_count": 3.0, "step": 2572, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.8333333134651184, "avg_layers": 32.0, "epoch": 14.967225054624908, "f1_execute": 0.9830508232116699, "f1_repeat": 1.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.140625, "learning_rate": 0.0008469104552753588, "loss": 0.0112, "macro_f1": 0.9640473127365112, "num_tokens": 3760279.0, "repeat_count": 1.0, "routers_loss": 0.016381841152906418, "skip_count": 6.0, "step": 2574, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 14.978878368536053, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.126953125, "learning_rate": 0.0008466310410117622, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 3763002.0, "repeat_count": 1.0, "routers_loss": 0.0068369111977517605, "skip_count": 1.0, "step": 2576, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 14.990531682447196, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1240234375, "learning_rate": 0.0008463514181756261, "loss": 0.0079, "macro_f1": 1.0, "num_tokens": 3765637.0, "repeat_count": 1.0, "routers_loss": 0.0023549161851406097, "skip_count": 2.0, "step": 2578, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11767578125, "learning_rate": 0.0008460715869352035, "loss": 0.0083, "macro_f1": 0.3333333432674408, "num_tokens": 3768240.0, "repeat_count": 0.0, "routers_loss": 0.003136478364467621, "skip_count": 0.0, "step": 2580, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 15.011653313911143, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.091796875, "learning_rate": 0.0008457915474588724, "loss": 0.0086, "macro_f1": 0.5507246255874634, "num_tokens": 3770977.0, "repeat_count": 0.0, "routers_loss": 0.015323466621339321, "skip_count": 2.0, "step": 2582, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 15.023306627822286, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.166015625, "learning_rate": 0.0008455112999151366, "loss": 0.0079, "macro_f1": 0.4901960790157318, "num_tokens": 3774004.0, "repeat_count": 0.0, "routers_loss": 0.017231740057468414, "skip_count": 3.0, "step": 2584, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 37.0, "epoch": 15.03495994173343, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.248046875, "learning_rate": 0.0008452308444726248, "loss": 0.0056, "macro_f1": 1.0, "num_tokens": 3776898.0, "repeat_count": 3.0, "routers_loss": 0.011201013810932636, "skip_count": 2.0, "step": 2586, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 15.046613255644575, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2578125, "learning_rate": 0.0008449501813000906, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 3779945.0, "repeat_count": 0.0, "routers_loss": 0.002790085505694151, "skip_count": 3.0, "step": 2588, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.058266569555718, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.146484375, "learning_rate": 0.0008446693105664129, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 3783024.0, "repeat_count": 0.0, "routers_loss": 0.002746149431914091, "skip_count": 0.0, "step": 2590, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 35.0, "epoch": 15.069919883466861, "f1_execute": 0.9846153855323792, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.296875, "learning_rate": 0.0008443882324405954, "loss": 0.0093, "macro_f1": 0.928205132484436, "num_tokens": 3785824.0, "repeat_count": 1.0, "routers_loss": 0.023209640756249428, "skip_count": 3.0, "step": 2592, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 15.081573197378004, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11572265625, "learning_rate": 0.0008441069470917664, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 3788887.0, "repeat_count": 0.0, "routers_loss": 0.0020308184903115034, "skip_count": 1.0, "step": 2594, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 15.093226511289147, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.12109375, "learning_rate": 0.0008438254546891792, "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 3792841.0, "repeat_count": 0.0, "routers_loss": 0.008654826320707798, "skip_count": 2.0, "step": 2596, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 15.10487982520029, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1953125, "learning_rate": 0.0008435437554022115, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 3795300.0, "repeat_count": 1.0, "routers_loss": 0.0067406026646494865, "skip_count": 0.0, "step": 2598, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 15.116533139111436, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.14453125, "learning_rate": 0.0008432618494003657, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 3797768.0, "repeat_count": 0.0, "routers_loss": 0.003471480682492256, "skip_count": 2.0, "step": 2600, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 15.128186453022579, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1484375, "learning_rate": 0.0008429797368532681, "loss": 0.006, "macro_f1": 1.0, "num_tokens": 3801917.0, "repeat_count": 1.0, "routers_loss": 0.004844320472329855, "skip_count": 2.0, "step": 2602, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 33.0, "epoch": 15.139839766933722, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.2021484375, "learning_rate": 0.0008426974179306699, "loss": 0.0088, "macro_f1": 0.6139194369316101, "num_tokens": 3805575.0, "repeat_count": 0.0, "routers_loss": 0.009143726900219917, "skip_count": 4.0, "step": 2604, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.151493080844865, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07470703125, "learning_rate": 0.000842414892802446, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 3808389.0, "repeat_count": 0.0, "routers_loss": 0.0048062680289149284, "skip_count": 0.0, "step": 2606, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 15.163146394756009, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.2177734375, "learning_rate": 0.0008421321616385957, "loss": 0.0068, "macro_f1": 1.0, "num_tokens": 3811741.0, "repeat_count": 1.0, "routers_loss": 0.009715022519230843, "skip_count": 2.0, "step": 2608, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.174799708667152, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1806640625, "learning_rate": 0.000841849224609242, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 3814731.0, "repeat_count": 0.0, "routers_loss": 0.0031659468077123165, "skip_count": 0.0, "step": 2610, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 15.186453022578295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0888671875, "learning_rate": 0.0008415660818846318, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 3817502.0, "repeat_count": 0.0, "routers_loss": 0.0067526837810873985, "skip_count": 2.0, "step": 2612, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 35.0, "epoch": 15.19810633648944, "f1_execute": 0.9846153855323792, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.1552734375, "learning_rate": 0.0008412827336351361, "loss": 0.0077, "macro_f1": 0.928205132484436, "num_tokens": 3820182.0, "repeat_count": 1.0, "routers_loss": 0.00592645350843668, "skip_count": 3.0, "step": 2614, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 15.209759650400583, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10693359375, "learning_rate": 0.0008409991800312493, "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 3823168.0, "repeat_count": 1.0, "routers_loss": 0.005980054382234812, "skip_count": 4.0, "step": 2616, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 15.221412964311726, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.12451171875, "learning_rate": 0.0008407154212435893, "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 3826128.0, "repeat_count": 1.0, "routers_loss": 0.01055466290563345, "skip_count": 1.0, "step": 2618, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.23306627822287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10693359375, "learning_rate": 0.0008404314574428975, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 3828733.0, "repeat_count": 0.0, "routers_loss": 0.0042555127292871475, "skip_count": 0.0, "step": 2620, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 15.244719592134013, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.06689453125, "learning_rate": 0.000840147288800039, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 3831709.0, "repeat_count": 4.0, "routers_loss": 0.009978566318750381, "skip_count": 4.0, "step": 2622, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.8333333134651184, "avg_layers": 31.0, "epoch": 15.256372906045156, "f1_execute": 0.9836065173149109, "f1_repeat": 0.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.140625, "learning_rate": 0.0008398629154860016, "loss": 0.006, "macro_f1": 0.63089919090271, "num_tokens": 3834627.0, "repeat_count": 0.0, "routers_loss": 0.017729895189404488, "skip_count": 6.0, "step": 2624, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 15.2680262199563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1171875, "learning_rate": 0.0008395783376718966, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 3837451.0, "repeat_count": 0.0, "routers_loss": 0.00817107129842043, "skip_count": 3.0, "step": 2626, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 15.279679533867444, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1357421875, "learning_rate": 0.0008392935555289584, "loss": 0.0078, "macro_f1": 0.5507246255874634, "num_tokens": 3840099.0, "repeat_count": 0.0, "routers_loss": 0.006787733640521765, "skip_count": 2.0, "step": 2628, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.291332847778587, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.083984375, "learning_rate": 0.0008390085692285441, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 3842773.0, "repeat_count": 0.0, "routers_loss": 0.0025574115570634604, "skip_count": 0.0, "step": 2630, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 15.30298616168973, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.140625, "learning_rate": 0.0008387233789421338, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 3845510.0, "repeat_count": 0.0, "routers_loss": 0.0067926873452961445, "skip_count": 1.0, "step": 2632, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 15.314639475600874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08544921875, "learning_rate": 0.0008384379848413304, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 3848216.0, "repeat_count": 0.0, "routers_loss": 0.001872074673883617, "skip_count": 1.0, "step": 2634, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 15.326292789512017, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.1103515625, "learning_rate": 0.0008381523870978592, "loss": 0.0076, "macro_f1": 0.5950249433517456, "num_tokens": 3851438.0, "repeat_count": 0.0, "routers_loss": 0.021905923262238503, "skip_count": 2.0, "step": 2636, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.33794610342316, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1455078125, "learning_rate": 0.0008378665858835684, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 3854121.0, "repeat_count": 0.0, "routers_loss": 0.0016704845475032926, "skip_count": 0.0, "step": 2638, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.349599417334304, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.091796875, "learning_rate": 0.0008375805813704282, "loss": 0.0092, "macro_f1": 0.3333333432674408, "num_tokens": 3857377.0, "repeat_count": 0.0, "routers_loss": 0.0020611535292118788, "skip_count": 0.0, "step": 2640, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 15.361252731245449, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1083984375, "learning_rate": 0.0008372943737305314, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 3859998.0, "repeat_count": 0.0, "routers_loss": 0.013227760791778564, "skip_count": 3.0, "step": 2642, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.372906045156592, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1455078125, "learning_rate": 0.0008370079631360931, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 3862424.0, "repeat_count": 0.0, "routers_loss": 0.0016974929021671414, "skip_count": 0.0, "step": 2644, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 15.384559359067735, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.20703125, "learning_rate": 0.0008367213497594501, "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 3864971.0, "repeat_count": 5.0, "routers_loss": 0.003154122270643711, "skip_count": 8.0, "step": 2646, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 15.396212672978878, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1455078125, "learning_rate": 0.0008364345337730619, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 3867481.0, "repeat_count": 0.0, "routers_loss": 0.004621668253093958, "skip_count": 2.0, "step": 2648, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 15.407865986890021, "f1_execute": 0.9841269850730896, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.20703125, "learning_rate": 0.0008361475153495092, "loss": 0.0107, "macro_f1": 0.8835979700088501, "num_tokens": 3871041.0, "repeat_count": 2.0, "routers_loss": 0.028709402307868004, "skip_count": 3.0, "step": 2650, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 15.419519300801165, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1044921875, "learning_rate": 0.0008358602946614952, "loss": 0.0071, "macro_f1": 1.0, "num_tokens": 3873989.0, "repeat_count": 1.0, "routers_loss": 0.00761241652071476, "skip_count": 1.0, "step": 2652, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 35.0, "epoch": 15.43117261471231, "f1_execute": 0.9846153855323792, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.11181640625, "learning_rate": 0.000835572871881844, "loss": 0.0072, "macro_f1": 0.928205132484436, "num_tokens": 3877391.0, "repeat_count": 1.0, "routers_loss": 0.021020544692873955, "skip_count": 3.0, "step": 2654, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 15.442825928623453, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.306640625, "learning_rate": 0.0008352852471835018, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 3880062.0, "repeat_count": 1.0, "routers_loss": 0.0015103090554475784, "skip_count": 0.0, "step": 2656, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 15.454479242534596, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.08349609375, "learning_rate": 0.0008349974207395365, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 3883397.0, "repeat_count": 1.0, "routers_loss": 0.0020445946138352156, "skip_count": 0.0, "step": 2658, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.46613255644574, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1591796875, "learning_rate": 0.0008347093927231371, "loss": 0.0086, "macro_f1": 0.32863849401474, "num_tokens": 3886371.0, "repeat_count": 0.0, "routers_loss": 0.011711549945175648, "skip_count": 1.0, "step": 2660, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.477785870356882, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09716796875, "learning_rate": 0.0008344211633076135, "loss": 0.0049, "macro_f1": 0.32863849401474, "num_tokens": 3889417.0, "repeat_count": 0.0, "routers_loss": 0.021929766982793808, "skip_count": 1.0, "step": 2662, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 15.489439184268026, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1455078125, "learning_rate": 0.0008341327326663976, "loss": 0.0087, "macro_f1": 0.6666666865348816, "num_tokens": 3891985.0, "repeat_count": 0.0, "routers_loss": 0.00193791592027992, "skip_count": 2.0, "step": 2664, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.501092498179169, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1416015625, "learning_rate": 0.000833844100973042, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 3894280.0, "repeat_count": 0.0, "routers_loss": 0.0009432074730284512, "skip_count": 0.0, "step": 2666, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 15.512745812090314, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1708984375, "learning_rate": 0.0008335552684012201, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 3897117.0, "repeat_count": 0.0, "routers_loss": 0.004018259700387716, "skip_count": 3.0, "step": 2668, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.524399126001457, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09619140625, "learning_rate": 0.0008332662351247263, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 3900837.0, "repeat_count": 0.0, "routers_loss": 0.0017553855432197452, "skip_count": 0.0, "step": 2670, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 15.5360524399126, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.11962890625, "learning_rate": 0.0008329770013174758, "loss": 0.0082, "macro_f1": 1.0, "num_tokens": 3903893.0, "repeat_count": 2.0, "routers_loss": 0.005385991185903549, "skip_count": 2.0, "step": 2672, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 15.547705753823744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.181640625, "learning_rate": 0.0008326875671535045, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 3906440.0, "repeat_count": 0.0, "routers_loss": 0.0024922804441303015, "skip_count": 2.0, "step": 2674, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 15.559359067734887, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.162109375, "learning_rate": 0.0008323979328069688, "loss": 0.0075, "macro_f1": 0.661835789680481, "num_tokens": 3909248.0, "repeat_count": 1.0, "routers_loss": 0.02391377091407776, "skip_count": 1.0, "step": 2676, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 15.57101238164603, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.220703125, "learning_rate": 0.0008321080984521459, "loss": 0.0076, "macro_f1": 0.6616915464401245, "num_tokens": 3912200.0, "repeat_count": 1.0, "routers_loss": 0.013289694674313068, "skip_count": 2.0, "step": 2678, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 15.582665695557175, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1376953125, "learning_rate": 0.0008318180642634324, "loss": 0.0065, "macro_f1": 1.0, "num_tokens": 3914989.0, "repeat_count": 2.0, "routers_loss": 0.0034987872932106256, "skip_count": 4.0, "step": 2680, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 15.594319009468318, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08447265625, "learning_rate": 0.000831527830415346, "loss": 0.0082, "macro_f1": 1.0, "num_tokens": 3917921.0, "repeat_count": 1.0, "routers_loss": 0.004313867539167404, "skip_count": 2.0, "step": 2682, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 15.605972323379461, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.050048828125, "learning_rate": 0.0008312373970825245, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 3921143.0, "repeat_count": 0.0, "routers_loss": 0.008675304241478443, "skip_count": 3.0, "step": 2684, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.617625637290605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.115234375, "learning_rate": 0.0008309467644397253, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 3923798.0, "repeat_count": 0.0, "routers_loss": 0.004471088293939829, "skip_count": 0.0, "step": 2686, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.629278951201748, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1962890625, "learning_rate": 0.0008306559326618259, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 3926798.0, "repeat_count": 0.0, "routers_loss": 0.0022357115522027016, "skip_count": 0.0, "step": 2688, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 15.640932265112891, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2099609375, "learning_rate": 0.0008303649019238239, "loss": 0.0142, "macro_f1": 0.6666666865348816, "num_tokens": 3930148.0, "repeat_count": 0.0, "routers_loss": 0.0027154197450727224, "skip_count": 1.0, "step": 2690, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 15.652585579024034, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.111328125, "learning_rate": 0.0008300736724008365, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 3933746.0, "repeat_count": 1.0, "routers_loss": 0.0026417619083076715, "skip_count": 0.0, "step": 2692, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 15.66423889293518, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.15234375, "learning_rate": 0.0008297822442680999, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 3936441.0, "repeat_count": 0.0, "routers_loss": 0.007177150342613459, "skip_count": 2.0, "step": 2694, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 15.675892206846322, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1416015625, "learning_rate": 0.0008294906177009707, "loss": 0.0069, "macro_f1": 1.0, "num_tokens": 3939301.0, "repeat_count": 2.0, "routers_loss": 0.004311054013669491, "skip_count": 2.0, "step": 2696, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.687545520757466, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11181640625, "learning_rate": 0.0008291987928749244, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 3942077.0, "repeat_count": 0.0, "routers_loss": 0.0006975915166549385, "skip_count": 0.0, "step": 2698, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 15.699198834668609, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1396484375, "learning_rate": 0.000828906769965556, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 3945350.0, "repeat_count": 0.0, "routers_loss": 0.0035494177136570215, "skip_count": 2.0, "step": 2700, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 15.710852148579752, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.212890625, "learning_rate": 0.0008286145491485792, "loss": 0.0072, "macro_f1": 1.0, "num_tokens": 3948053.0, "repeat_count": 1.0, "routers_loss": 0.008176249451935291, "skip_count": 3.0, "step": 2702, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 15.722505462490895, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.33203125, "learning_rate": 0.0008283221305998279, "loss": 0.0096, "macro_f1": 0.6666666865348816, "num_tokens": 3950799.0, "repeat_count": 0.0, "routers_loss": 0.007261097431182861, "skip_count": 2.0, "step": 2704, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.734158776402039, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.095703125, "learning_rate": 0.0008280295144952537, "loss": 0.0098, "macro_f1": 0.3333333432674408, "num_tokens": 3953798.0, "repeat_count": 0.0, "routers_loss": 0.001698907115496695, "skip_count": 0.0, "step": 2706, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.745812090313184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.275390625, "learning_rate": 0.0008277367010109278, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 3956953.0, "repeat_count": 0.0, "routers_loss": 0.0007359234150499105, "skip_count": 0.0, "step": 2708, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 15.757465404224327, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2138671875, "learning_rate": 0.0008274436903230398, "loss": 0.0067, "macro_f1": 0.6615384817123413, "num_tokens": 3960458.0, "repeat_count": 1.0, "routers_loss": 0.01967126503586769, "skip_count": 3.0, "step": 2710, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.76911871813547, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2578125, "learning_rate": 0.0008271504826078987, "loss": 0.007, "macro_f1": 0.32863849401474, "num_tokens": 3963090.0, "repeat_count": 0.0, "routers_loss": 0.01369393803179264, "skip_count": 1.0, "step": 2712, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.780772032046613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2109375, "learning_rate": 0.0008268570780419306, "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 3966042.0, "repeat_count": 0.0, "routers_loss": 0.002660051453858614, "skip_count": 0.0, "step": 2714, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.792425345957756, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1650390625, "learning_rate": 0.0008265634768016819, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 3969103.0, "repeat_count": 0.0, "routers_loss": 0.0012149540707468987, "skip_count": 0.0, "step": 2716, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.8040786598689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.000826269679063816, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 3972045.0, "repeat_count": 0.0, "routers_loss": 0.000957841461058706, "skip_count": 0.0, "step": 2718, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 15.815731973780043, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.138671875, "learning_rate": 0.0008259756850051148, "loss": 0.0071, "macro_f1": 0.661835789680481, "num_tokens": 3975276.0, "repeat_count": 1.0, "routers_loss": 0.012067343108355999, "skip_count": 1.0, "step": 2720, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 37.0, "epoch": 15.827385287691188, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1748046875, "learning_rate": 0.0008256814948024786, "loss": 0.0093, "macro_f1": 1.0, "num_tokens": 3978248.0, "repeat_count": 2.0, "routers_loss": 0.0033860974945127964, "skip_count": 1.0, "step": 2722, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.839038601602331, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.158203125, "learning_rate": 0.0008253871086329255, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 3981209.0, "repeat_count": 0.0, "routers_loss": 0.001102686277590692, "skip_count": 0.0, "step": 2724, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 15.850691915513474, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2119140625, "learning_rate": 0.0008250925266735918, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 3984095.0, "repeat_count": 0.0, "routers_loss": 0.0038591218180954456, "skip_count": 1.0, "step": 2726, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.862345229424617, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.107421875, "learning_rate": 0.0008247977491017311, "loss": 0.0067, "macro_f1": 0.32380953431129456, "num_tokens": 3987745.0, "repeat_count": 0.0, "routers_loss": 0.024927029386162758, "skip_count": 2.0, "step": 2728, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.87399854333576, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1083984375, "learning_rate": 0.0008245027760947154, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 3990682.0, "repeat_count": 0.0, "routers_loss": 0.00199747271835804, "skip_count": 0.0, "step": 2730, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 15.885651857246904, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.111328125, "learning_rate": 0.0008242076078300336, "loss": 0.0074, "macro_f1": 1.0, "num_tokens": 3993446.0, "repeat_count": 2.0, "routers_loss": 0.0057619730941951275, "skip_count": 2.0, "step": 2732, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.897305171158049, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1953125, "learning_rate": 0.0008239122444852926, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 3995941.0, "repeat_count": 0.0, "routers_loss": 0.00512414425611496, "skip_count": 0.0, "step": 2734, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.908958485069192, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11767578125, "learning_rate": 0.0008236166862382162, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 3998516.0, "repeat_count": 0.0, "routers_loss": 0.0006163757643662393, "skip_count": 0.0, "step": 2736, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 15.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.12060546875, "learning_rate": 0.000823320933266646, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 4001197.0, "repeat_count": 0.0, "routers_loss": 0.0011459601810202003, "skip_count": 2.0, "step": 2738, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 15.932265112891479, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.33203125, "learning_rate": 0.0008230249857485405, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 4003411.0, "repeat_count": 0.0, "routers_loss": 0.003660835325717926, "skip_count": 2.0, "step": 2740, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.943918426802622, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09765625, "learning_rate": 0.0008227288438619753, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 4006257.0, "repeat_count": 0.0, "routers_loss": 0.0010931146098300815, "skip_count": 0.0, "step": 2742, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.955571740713765, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1162109375, "learning_rate": 0.0008224325077851429, "loss": 0.0075, "macro_f1": 0.32863849401474, "num_tokens": 4008600.0, "repeat_count": 0.0, "routers_loss": 0.014521141536533833, "skip_count": 1.0, "step": 2744, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 15.967225054624908, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.16015625, "learning_rate": 0.0008221359776963525, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 4011902.0, "repeat_count": 1.0, "routers_loss": 0.002924735890701413, "skip_count": 0.0, "step": 2746, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.978878368536053, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.318359375, "learning_rate": 0.0008218392537740304, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 4014557.0, "repeat_count": 0.0, "routers_loss": 0.0005848329165019095, "skip_count": 0.0, "step": 2748, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 15.990531682447196, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2275390625, "learning_rate": 0.0008215423361967196, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 4017014.0, "repeat_count": 0.0, "routers_loss": 0.0008271581609733403, "skip_count": 0.0, "step": 2750, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 16.0, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10400390625, "learning_rate": 0.0008212452251430788, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 4019456.0, "repeat_count": 1.0, "routers_loss": 0.002749481238424778, "skip_count": 2.0, "step": 2752, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 39.0, "epoch": 16.011653313911143, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.11767578125, "learning_rate": 0.0008209479207918844, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 4022548.0, "repeat_count": 3.0, "routers_loss": 0.009314796887338161, "skip_count": 0.0, "step": 2754, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.023306627822286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1162109375, "learning_rate": 0.0008206504233220277, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 4025236.0, "repeat_count": 0.0, "routers_loss": 0.006683574989438057, "skip_count": 0.0, "step": 2756, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.03495994173343, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08056640625, "learning_rate": 0.0008203527329125176, "loss": 0.006, "macro_f1": 0.32863849401474, "num_tokens": 4028456.0, "repeat_count": 1.0, "routers_loss": 0.031460996717214584, "skip_count": 0.0, "step": 2758, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.046613255644573, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.076171875, "learning_rate": 0.0008200548497424778, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 4031130.0, "repeat_count": 0.0, "routers_loss": 0.003567702369764447, "skip_count": 0.0, "step": 2760, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.058266569555716, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.255859375, "learning_rate": 0.0008197567739911489, "loss": 0.0085, "macro_f1": 0.32863849401474, "num_tokens": 4034693.0, "repeat_count": 0.0, "routers_loss": 0.009648723527789116, "skip_count": 1.0, "step": 2762, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 16.06991988346686, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.298828125, "learning_rate": 0.0008194585058378871, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 4037321.0, "repeat_count": 1.0, "routers_loss": 0.00225069303996861, "skip_count": 0.0, "step": 2764, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 16.081573197378006, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08837890625, "learning_rate": 0.0008191600454621642, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 4039653.0, "repeat_count": 1.0, "routers_loss": 0.0023299299646168947, "skip_count": 2.0, "step": 2766, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.09322651128915, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11328125, "learning_rate": 0.0008188613930435679, "loss": 0.0084, "macro_f1": 0.3333333432674408, "num_tokens": 4042204.0, "repeat_count": 0.0, "routers_loss": 0.0013161961687728763, "skip_count": 0.0, "step": 2768, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.104879825200292, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09716796875, "learning_rate": 0.0008185625487618014, "loss": 0.0128, "macro_f1": 0.3333333432674408, "num_tokens": 4045212.0, "repeat_count": 0.0, "routers_loss": 0.0006326966104097664, "skip_count": 0.0, "step": 2770, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.116533139111436, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2197265625, "learning_rate": 0.0008182635127966831, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 4047941.0, "repeat_count": 0.0, "routers_loss": 0.0015066665364429355, "skip_count": 0.0, "step": 2772, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.12818645302258, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.0008179642853281471, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 4050894.0, "repeat_count": 0.0, "routers_loss": 0.005050297360867262, "skip_count": 0.0, "step": 2774, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 16.139839766933722, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1435546875, "learning_rate": 0.0008176648665362425, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 4053848.0, "repeat_count": 1.0, "routers_loss": 0.0024434153456240892, "skip_count": 0.0, "step": 2776, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.151493080844865, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1787109375, "learning_rate": 0.0008173652566011338, "loss": 0.0102, "macro_f1": 0.3333333432674408, "num_tokens": 4056831.0, "repeat_count": 0.0, "routers_loss": 0.0029241496231406927, "skip_count": 0.0, "step": 2778, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.16314639475601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.28125, "learning_rate": 0.0008170654557031002, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 4060137.0, "repeat_count": 0.0, "routers_loss": 0.0056282225996255875, "skip_count": 0.0, "step": 2780, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 16.17479970866715, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06396484375, "learning_rate": 0.0008167654640225357, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 4063160.0, "repeat_count": 0.0, "routers_loss": 0.002159922616556287, "skip_count": 2.0, "step": 2782, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 16.186453022578295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09912109375, "learning_rate": 0.0008164652817399496, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 4066314.0, "repeat_count": 0.0, "routers_loss": 0.0014264502096921206, "skip_count": 2.0, "step": 2784, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 16.198106336489438, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1708984375, "learning_rate": 0.0008161649090359655, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 4068498.0, "repeat_count": 0.0, "routers_loss": 0.004046193789690733, "skip_count": 1.0, "step": 2786, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 16.20975965040058, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0859375, "learning_rate": 0.0008158643460913216, "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 4071812.0, "repeat_count": 1.0, "routers_loss": 0.00575997494161129, "skip_count": 1.0, "step": 2788, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.221412964311725, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0849609375, "learning_rate": 0.0008155635930868708, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 4074608.0, "repeat_count": 0.0, "routers_loss": 0.0008747714455239475, "skip_count": 0.0, "step": 2790, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.23306627822287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.16796875, "learning_rate": 0.0008152626502035803, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 4077759.0, "repeat_count": 0.0, "routers_loss": 0.0010953082237392664, "skip_count": 0.0, "step": 2792, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 16.244719592134015, "f1_execute": 0.9473683834075928, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.12890625, "learning_rate": 0.000814961517622531, "loss": 0.0066, "macro_f1": 0.871345043182373, "num_tokens": 4080444.0, "repeat_count": 3.0, "routers_loss": 0.0483657605946064, "skip_count": 6.0, "step": 2794, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 16.256372906045158, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.20703125, "learning_rate": 0.0008146601955249188, "loss": 0.0067, "macro_f1": 0.6616915464401245, "num_tokens": 4083997.0, "repeat_count": 1.0, "routers_loss": 0.028780622407794, "skip_count": 2.0, "step": 2796, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.2680262199563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1708984375, "learning_rate": 0.000814358684092053, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 4086498.0, "repeat_count": 0.0, "routers_loss": 0.0014002255629748106, "skip_count": 0.0, "step": 2798, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.279679533867444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.0008140569835053574, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 4089491.0, "repeat_count": 0.0, "routers_loss": 0.002101035788655281, "skip_count": 0.0, "step": 2800, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.291332847778587, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09228515625, "learning_rate": 0.000813755093946369, "loss": 0.0052, "macro_f1": 0.32863849401474, "num_tokens": 4091976.0, "repeat_count": 1.0, "routers_loss": 0.023419123142957687, "skip_count": 0.0, "step": 2802, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.30298616168973, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.142578125, "learning_rate": 0.000813453015596739, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 4095067.0, "repeat_count": 0.0, "routers_loss": 0.0007170522003434598, "skip_count": 0.0, "step": 2804, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 16.314639475600874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06494140625, "learning_rate": 0.0008131507486382318, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 4097782.0, "repeat_count": 0.0, "routers_loss": 0.006102937739342451, "skip_count": 4.0, "step": 2806, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.326292789512017, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1005859375, "learning_rate": 0.0008128482932527255, "loss": 0.007, "macro_f1": 0.32863849401474, "num_tokens": 4100925.0, "repeat_count": 0.0, "routers_loss": 0.009343058802187443, "skip_count": 1.0, "step": 2808, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 16.33794610342316, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09765625, "learning_rate": 0.0008125456496222116, "loss": 0.0082, "macro_f1": 0.6666666865348816, "num_tokens": 4104312.0, "repeat_count": 0.0, "routers_loss": 0.008670558221638203, "skip_count": 2.0, "step": 2810, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 16.349599417334304, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0927734375, "learning_rate": 0.0008122428179287948, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 4107317.0, "repeat_count": 1.0, "routers_loss": 0.009157488122582436, "skip_count": 4.0, "step": 2812, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 16.361252731245447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1259765625, "learning_rate": 0.0008119397983546931, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 4110105.0, "repeat_count": 0.0, "routers_loss": 0.0049841878935694695, "skip_count": 3.0, "step": 2814, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 16.37290604515659, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.171875, "learning_rate": 0.0008116365910822373, "loss": 0.0056, "macro_f1": 1.0, "num_tokens": 4113004.0, "repeat_count": 1.0, "routers_loss": 0.004639942664653063, "skip_count": 3.0, "step": 2816, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.384559359067733, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.125, "learning_rate": 0.0008113331962938714, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 4116833.0, "repeat_count": 0.0, "routers_loss": 0.002029155846685171, "skip_count": 0.0, "step": 2818, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.39621267297888, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.138671875, "learning_rate": 0.000811029614172152, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 4119493.0, "repeat_count": 0.0, "routers_loss": 0.0039411443285644054, "skip_count": 0.0, "step": 2820, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 36.0, "epoch": 16.407865986890023, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.1435546875, "learning_rate": 0.0008107258448997486, "loss": 0.0086, "macro_f1": 0.9280423521995544, "num_tokens": 4122470.0, "repeat_count": 2.0, "routers_loss": 0.01255357451736927, "skip_count": 3.0, "step": 2822, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 16.419519300801166, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.12255859375, "learning_rate": 0.000810421888659443, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 4125715.0, "repeat_count": 1.0, "routers_loss": 0.011776686646044254, "skip_count": 1.0, "step": 2824, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 16.43117261471231, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1328125, "learning_rate": 0.00081011774563413, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 4128775.0, "repeat_count": 0.0, "routers_loss": 0.005971638020128012, "skip_count": 3.0, "step": 2826, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 16.442825928623453, "f1_execute": 0.9850746393203735, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.1552734375, "learning_rate": 0.0008098134160068166, "loss": 0.0061, "macro_f1": 0.8839138746261597, "num_tokens": 4131726.0, "repeat_count": 2.0, "routers_loss": 0.017703929916024208, "skip_count": 1.0, "step": 2828, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 16.454479242534596, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.154296875, "learning_rate": 0.0008095088999606218, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 4134645.0, "repeat_count": 0.0, "routers_loss": 0.005678823683410883, "skip_count": 2.0, "step": 2830, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 16.46613255644574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.345703125, "learning_rate": 0.0008092041976787771, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 4137066.0, "repeat_count": 0.0, "routers_loss": 0.007632331922650337, "skip_count": 1.0, "step": 2832, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.477785870356882, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1328125, "learning_rate": 0.000808899309344626, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 4140434.0, "repeat_count": 0.0, "routers_loss": 0.004142462741583586, "skip_count": 0.0, "step": 2834, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.489439184268026, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1875, "learning_rate": 0.0008085942351416236, "loss": 0.0093, "macro_f1": 0.32863849401474, "num_tokens": 4143042.0, "repeat_count": 0.0, "routers_loss": 0.01912667602300644, "skip_count": 1.0, "step": 2836, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.50109249817917, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.18359375, "learning_rate": 0.0008082889752533374, "loss": 0.0088, "macro_f1": 0.3333333432674408, "num_tokens": 4146881.0, "repeat_count": 0.0, "routers_loss": 0.00043709587771445513, "skip_count": 0.0, "step": 2838, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 16.512745812090312, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.181640625, "learning_rate": 0.0008079835298634464, "loss": 0.008, "macro_f1": 1.0, "num_tokens": 4149626.0, "repeat_count": 2.0, "routers_loss": 0.002349897986277938, "skip_count": 3.0, "step": 2840, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 16.524399126001455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09814453125, "learning_rate": 0.000807677899155741, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 4152677.0, "repeat_count": 0.0, "routers_loss": 0.002850452670827508, "skip_count": 1.0, "step": 2842, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 16.5360524399126, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.0830078125, "learning_rate": 0.0008073720833141234, "loss": 0.0056, "macro_f1": 0.8839138746261597, "num_tokens": 4155506.0, "repeat_count": 1.0, "routers_loss": 0.018862644210457802, "skip_count": 2.0, "step": 2844, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.547705753823745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.123046875, "learning_rate": 0.000807066082522607, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 4158704.0, "repeat_count": 0.0, "routers_loss": 0.002998821437358856, "skip_count": 0.0, "step": 2846, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 16.55935906773489, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1865234375, "learning_rate": 0.0008067598969653167, "loss": 0.0079, "macro_f1": 0.6666666865348816, "num_tokens": 4161673.0, "repeat_count": 1.0, "routers_loss": 0.004475760273635387, "skip_count": 0.0, "step": 2848, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 16.57101238164603, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1005859375, "learning_rate": 0.0008064535268264883, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 4164668.0, "repeat_count": 4.0, "routers_loss": 0.004282637499272823, "skip_count": 7.0, "step": 2850, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 16.582665695557175, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.091796875, "learning_rate": 0.0008061469722904689, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 4167327.0, "repeat_count": 1.0, "routers_loss": 0.0007599781383760273, "skip_count": 1.0, "step": 2852, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 16.594319009468318, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10400390625, "learning_rate": 0.0008058402335417165, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 4170086.0, "repeat_count": 0.0, "routers_loss": 0.006073831580579281, "skip_count": 2.0, "step": 2854, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 16.60597232337946, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1328125, "learning_rate": 0.0008055333107648, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 4174381.0, "repeat_count": 0.0, "routers_loss": 0.0033202553167939186, "skip_count": 2.0, "step": 2856, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 16.617625637290605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1015625, "learning_rate": 0.0008052262041443986, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 4177237.0, "repeat_count": 0.0, "routers_loss": 0.0024890173226594925, "skip_count": 1.0, "step": 2858, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 16.629278951201748, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1552734375, "learning_rate": 0.0008049189138653028, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 4179783.0, "repeat_count": 0.0, "routers_loss": 0.0019647697918117046, "skip_count": 2.0, "step": 2860, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.64093226511289, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11962890625, "learning_rate": 0.0008046114401124132, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 4182661.0, "repeat_count": 0.0, "routers_loss": 0.0010124179534614086, "skip_count": 0.0, "step": 2862, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 16.652585579024034, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1298828125, "learning_rate": 0.0008043037830707408, "loss": 0.0081, "macro_f1": 1.0, "num_tokens": 4185679.0, "repeat_count": 1.0, "routers_loss": 0.010107602924108505, "skip_count": 2.0, "step": 2864, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 33.0, "epoch": 16.664238892935177, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.1044921875, "learning_rate": 0.0008039959429254071, "loss": 0.0073, "macro_f1": 0.6139194369316101, "num_tokens": 4188382.0, "repeat_count": 0.0, "routers_loss": 0.026133716106414795, "skip_count": 4.0, "step": 2866, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.67589220684632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1005859375, "learning_rate": 0.0008036879198616433, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 4190932.0, "repeat_count": 0.0, "routers_loss": 0.003288610139861703, "skip_count": 0.0, "step": 2868, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 16.687545520757464, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.15625, "learning_rate": 0.0008033797140647915, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 4193825.0, "repeat_count": 1.0, "routers_loss": 0.0025086880195885897, "skip_count": 0.0, "step": 2870, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.699198834668607, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.166015625, "learning_rate": 0.0008030713257203029, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 4197660.0, "repeat_count": 0.0, "routers_loss": 0.002417100127786398, "skip_count": 0.0, "step": 2872, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 16.710852148579754, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.177734375, "learning_rate": 0.000802762755013739, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4200340.0, "repeat_count": 1.0, "routers_loss": 0.0068132695741951466, "skip_count": 0.0, "step": 2874, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 16.722505462490897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.34375, "learning_rate": 0.0008024540021307708, "loss": 0.0077, "macro_f1": 0.6666666865348816, "num_tokens": 4202980.0, "repeat_count": 0.0, "routers_loss": 0.0034154518507421017, "skip_count": 3.0, "step": 2876, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.73415877640204, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.349609375, "learning_rate": 0.0008021450672571794, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 4205426.0, "repeat_count": 0.0, "routers_loss": 0.0034068122040480375, "skip_count": 0.0, "step": 2878, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 16.745812090313184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.173828125, "learning_rate": 0.0008018359505788547, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 4207821.0, "repeat_count": 0.0, "routers_loss": 0.006619867403060198, "skip_count": 1.0, "step": 2880, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.757465404224327, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.255859375, "learning_rate": 0.0008015266522817964, "loss": 0.0091, "macro_f1": 0.3333333432674408, "num_tokens": 4211721.0, "repeat_count": 0.0, "routers_loss": 0.0034853401593863964, "skip_count": 0.0, "step": 2882, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 16.76911871813547, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.12255859375, "learning_rate": 0.0008012171725521136, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 4214344.0, "repeat_count": 0.0, "routers_loss": 0.01007680781185627, "skip_count": 2.0, "step": 2884, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 39.0, "epoch": 16.780772032046613, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.12109375, "learning_rate": 0.0008009075115760243, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 4217152.0, "repeat_count": 3.0, "routers_loss": 0.0009838249534368515, "skip_count": 0.0, "step": 2886, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 16.792425345957756, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.140625, "learning_rate": 0.0008005976695398553, "loss": 0.0102, "macro_f1": 1.0, "num_tokens": 4220587.0, "repeat_count": 1.0, "routers_loss": 0.0009835244854912162, "skip_count": 2.0, "step": 2888, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 16.8040786598689, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2041015625, "learning_rate": 0.0008002876466300435, "loss": 0.0077, "macro_f1": 0.661835789680481, "num_tokens": 4223057.0, "repeat_count": 1.0, "routers_loss": 0.024671584367752075, "skip_count": 1.0, "step": 2890, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.815731973780043, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1416015625, "learning_rate": 0.0007999774430331333, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 4225510.0, "repeat_count": 0.0, "routers_loss": 0.005015290342271328, "skip_count": 0.0, "step": 2892, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 16.827385287691186, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1513671875, "learning_rate": 0.0007996670589357781, "loss": 0.0088, "macro_f1": 1.0, "num_tokens": 4228834.0, "repeat_count": 1.0, "routers_loss": 0.003935507498681545, "skip_count": 2.0, "step": 2894, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 16.83903860160233, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.16796875, "learning_rate": 0.0007993564945247409, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 4231328.0, "repeat_count": 0.0, "routers_loss": 0.004488540347665548, "skip_count": 6.0, "step": 2896, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 16.850691915513472, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.06982421875, "learning_rate": 0.000799045749986892, "loss": 0.0054, "macro_f1": 0.6616915464401245, "num_tokens": 4234540.0, "repeat_count": 2.0, "routers_loss": 0.007366691250354052, "skip_count": 1.0, "step": 2898, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 16.86234522942462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.087890625, "learning_rate": 0.0007987348255092104, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 4237459.0, "repeat_count": 0.0, "routers_loss": 0.00285612721927464, "skip_count": 1.0, "step": 2900, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 16.873998543335762, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10400390625, "learning_rate": 0.0007984237212787839, "loss": 0.0073, "macro_f1": 1.0, "num_tokens": 4240311.0, "repeat_count": 1.0, "routers_loss": 0.0027112679090350866, "skip_count": 4.0, "step": 2902, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.885651857246906, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.294921875, "learning_rate": 0.0007981124374828079, "loss": 0.0086, "macro_f1": 0.3333333432674408, "num_tokens": 4243270.0, "repeat_count": 0.0, "routers_loss": 0.001036914880387485, "skip_count": 0.0, "step": 2904, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.89730517115805, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.115234375, "learning_rate": 0.0007978009743085862, "loss": 0.0055, "macro_f1": 0.32863849401474, "num_tokens": 4245929.0, "repeat_count": 1.0, "routers_loss": 0.03184618055820465, "skip_count": 0.0, "step": 2906, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.908958485069192, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1826171875, "learning_rate": 0.00079748933194353, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 4248544.0, "repeat_count": 0.0, "routers_loss": 0.0035733357071876526, "skip_count": 0.0, "step": 2908, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 16.920611798980335, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2119140625, "learning_rate": 0.000797177510575159, "loss": 0.0065, "macro_f1": 0.5507246255874634, "num_tokens": 4251219.0, "repeat_count": 0.0, "routers_loss": 0.013821439817547798, "skip_count": 2.0, "step": 2910, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.93226511289148, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1142578125, "learning_rate": 0.0007968655103911003, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 4253767.0, "repeat_count": 0.0, "routers_loss": 0.0015896944096311927, "skip_count": 0.0, "step": 2912, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 16.94391842680262, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09228515625, "learning_rate": 0.0007965533315790883, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 4256587.0, "repeat_count": 0.0, "routers_loss": 0.005335025954991579, "skip_count": 3.0, "step": 2914, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.955571740713765, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2119140625, "learning_rate": 0.0007962409743269654, "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 4259883.0, "repeat_count": 0.0, "routers_loss": 0.004477449227124453, "skip_count": 0.0, "step": 2916, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 16.967225054624908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.138671875, "learning_rate": 0.0007959284388226811, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4262538.0, "repeat_count": 0.0, "routers_loss": 0.0021851530764251947, "skip_count": 1.0, "step": 2918, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 16.97887836853605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09033203125, "learning_rate": 0.0007956157252542923, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 4265509.0, "repeat_count": 0.0, "routers_loss": 0.0022303354926407337, "skip_count": 0.0, "step": 2920, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 16.990531682447195, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11767578125, "learning_rate": 0.0007953028338099627, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 4268522.0, "repeat_count": 0.0, "routers_loss": 0.011767344549298286, "skip_count": 2.0, "step": 2922, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 17.0, "f1_execute": 0.9850746393203735, "f1_repeat": 0.800000011920929, "f1_skip": 0.0, "grad_norm": 0.1767578125, "learning_rate": 0.0007949897646779635, "loss": 0.0099, "macro_f1": 0.5950249433517456, "num_tokens": 4270672.0, "repeat_count": 3.0, "routers_loss": 0.02770444191992283, "skip_count": 0.0, "step": 2924, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.011653313911143, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06591796875, "learning_rate": 0.0007946765180466724, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 4274134.0, "repeat_count": 0.0, "routers_loss": 0.006146728526800871, "skip_count": 0.0, "step": 2926, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.023306627822286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08154296875, "learning_rate": 0.0007943630941045743, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 4276705.0, "repeat_count": 0.0, "routers_loss": 0.0025843556504696608, "skip_count": 0.0, "step": 2928, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.03495994173343, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1572265625, "learning_rate": 0.0007940494930402604, "loss": 0.007, "macro_f1": 0.32863849401474, "num_tokens": 4279644.0, "repeat_count": 1.0, "routers_loss": 0.015761200338602066, "skip_count": 0.0, "step": 2930, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 17.046613255644573, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0693359375, "learning_rate": 0.0007937357150424289, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4282675.0, "repeat_count": 0.0, "routers_loss": 0.015230057761073112, "skip_count": 3.0, "step": 2932, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.058266569555716, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.23828125, "learning_rate": 0.0007934217602998839, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 4286124.0, "repeat_count": 0.0, "routers_loss": 0.002867731498554349, "skip_count": 0.0, "step": 2934, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.06991988346686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1015625, "learning_rate": 0.0007931076290015364, "loss": 0.007, "macro_f1": 0.3333333432674408, "num_tokens": 4288997.0, "repeat_count": 0.0, "routers_loss": 0.0023414765018969774, "skip_count": 0.0, "step": 2936, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 17.081573197378006, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08447265625, "learning_rate": 0.0007927933213364032, "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 4291927.0, "repeat_count": 2.0, "routers_loss": 0.0030728608835488558, "skip_count": 4.0, "step": 2938, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 17.09322651128915, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.12060546875, "learning_rate": 0.0007924788374936079, "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 4294548.0, "repeat_count": 1.0, "routers_loss": 0.008073138073086739, "skip_count": 3.0, "step": 2940, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 17.104879825200292, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09765625, "learning_rate": 0.0007921641776623789, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 4297944.0, "repeat_count": 0.0, "routers_loss": 0.008657608181238174, "skip_count": 2.0, "step": 2942, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 17.116533139111436, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.361328125, "learning_rate": 0.0007918493420320518, "loss": 0.0103, "macro_f1": 0.6666666865348816, "num_tokens": 4300641.0, "repeat_count": 0.0, "routers_loss": 0.0014251169050112367, "skip_count": 2.0, "step": 2944, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.12818645302258, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.22265625, "learning_rate": 0.0007915343307920673, "loss": 0.0087, "macro_f1": 0.3333333432674408, "num_tokens": 4303184.0, "repeat_count": 0.0, "routers_loss": 0.0008262580959126353, "skip_count": 0.0, "step": 2946, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 17.139839766933722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2734375, "learning_rate": 0.0007912191441319719, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 4305749.0, "repeat_count": 0.0, "routers_loss": 0.0016430013347417116, "skip_count": 2.0, "step": 2948, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.151493080844865, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.18359375, "learning_rate": 0.0007909037822414176, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 4308475.0, "repeat_count": 0.0, "routers_loss": 0.0037778967525810003, "skip_count": 0.0, "step": 2950, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.16314639475601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09814453125, "learning_rate": 0.0007905882453101617, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 4312046.0, "repeat_count": 0.0, "routers_loss": 0.0020150032360106707, "skip_count": 0.0, "step": 2952, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.17479970866715, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1083984375, "learning_rate": 0.0007902725335280673, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 4314938.0, "repeat_count": 0.0, "routers_loss": 0.0031595672480762005, "skip_count": 0.0, "step": 2954, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 17.186453022578295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1015625, "learning_rate": 0.000789956647085102, "loss": 0.0091, "macro_f1": 0.6666666865348816, "num_tokens": 4317995.0, "repeat_count": 0.0, "routers_loss": 0.002940410515293479, "skip_count": 1.0, "step": 2956, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 36.0, "epoch": 17.198106336489438, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.375, "learning_rate": 0.0007896405861713394, "loss": 0.0073, "macro_f1": 0.9280423521995544, "num_tokens": 4320639.0, "repeat_count": 2.0, "routers_loss": 0.01009267196059227, "skip_count": 3.0, "step": 2958, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 17.20975965040058, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11328125, "learning_rate": 0.0007893243509769573, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 4323539.0, "repeat_count": 0.0, "routers_loss": 0.0021132221445441246, "skip_count": 2.0, "step": 2960, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 17.221412964311725, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10888671875, "learning_rate": 0.0007890079416922386, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 4326108.0, "repeat_count": 2.0, "routers_loss": 0.0014713496202602983, "skip_count": 2.0, "step": 2962, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 17.23306627822287, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2099609375, "learning_rate": 0.0007886913585075712, "loss": 0.007, "macro_f1": 0.5507246255874634, "num_tokens": 4328868.0, "repeat_count": 0.0, "routers_loss": 0.02117788791656494, "skip_count": 2.0, "step": 2964, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 17.244719592134015, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1279296875, "learning_rate": 0.0007883746016134474, "loss": 0.0072, "macro_f1": 0.661835789680481, "num_tokens": 4332984.0, "repeat_count": 1.0, "routers_loss": 0.02358441986143589, "skip_count": 1.0, "step": 2966, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 17.256372906045158, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1533203125, "learning_rate": 0.0007880576712004639, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 4335774.0, "repeat_count": 0.0, "routers_loss": 0.0032006590627133846, "skip_count": 2.0, "step": 2968, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 17.2680262199563, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.10498046875, "learning_rate": 0.0007877405674593221, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 4338394.0, "repeat_count": 1.0, "routers_loss": 0.0031376942060887814, "skip_count": 0.0, "step": 2970, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 17.279679533867444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.28515625, "learning_rate": 0.0007874232905808274, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 4341089.0, "repeat_count": 0.0, "routers_loss": 0.004887557588517666, "skip_count": 2.0, "step": 2972, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 17.291332847778587, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1513671875, "learning_rate": 0.0007871058407558899, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 4343554.0, "repeat_count": 0.0, "routers_loss": 0.005631360691040754, "skip_count": 1.0, "step": 2974, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 17.30298616168973, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.2119140625, "learning_rate": 0.0007867882181755231, "loss": 0.0038, "macro_f1": 1.0, "num_tokens": 4346609.0, "repeat_count": 1.0, "routers_loss": 0.0009087923099286854, "skip_count": 2.0, "step": 2976, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 17.314639475600874, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.09765625, "learning_rate": 0.0007864704230308448, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 4349068.0, "repeat_count": 1.0, "routers_loss": 0.009676039218902588, "skip_count": 1.0, "step": 2978, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.326292789512017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07275390625, "learning_rate": 0.0007861524555130768, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 4352202.0, "repeat_count": 0.0, "routers_loss": 0.00398996751755476, "skip_count": 0.0, "step": 2980, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 17.33794610342316, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09521484375, "learning_rate": 0.0007858343158135442, "loss": 0.008, "macro_f1": 0.6666666865348816, "num_tokens": 4355129.0, "repeat_count": 0.0, "routers_loss": 0.006882606074213982, "skip_count": 4.0, "step": 2982, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.349599417334304, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.158203125, "learning_rate": 0.000785516004123676, "loss": 0.0086, "macro_f1": 0.32863849401474, "num_tokens": 4357905.0, "repeat_count": 0.0, "routers_loss": 0.03665498271584511, "skip_count": 1.0, "step": 2984, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.7777777910232544, "avg_layers": 29.0, "epoch": 17.361252731245447, "f1_execute": 0.964285671710968, "f1_repeat": 0.0, "f1_skip": 0.875, "grad_norm": 0.2353515625, "learning_rate": 0.0007851975206350046, "loss": 0.0064, "macro_f1": 0.613095223903656, "num_tokens": 4360941.0, "repeat_count": 0.0, "routers_loss": 0.024045439437031746, "skip_count": 9.0, "step": 2986, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.37290604515659, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.259765625, "learning_rate": 0.0007848788655391658, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 4363985.0, "repeat_count": 0.0, "routers_loss": 0.003400679212063551, "skip_count": 0.0, "step": 2988, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 17.384559359067733, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.103515625, "learning_rate": 0.0007845600390278984, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4367059.0, "repeat_count": 0.0, "routers_loss": 0.006287879776209593, "skip_count": 2.0, "step": 2990, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 17.39621267297888, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.13671875, "learning_rate": 0.0007842410412930452, "loss": 0.0057, "macro_f1": 1.0, "num_tokens": 4370087.0, "repeat_count": 1.0, "routers_loss": 0.0031063950154930353, "skip_count": 1.0, "step": 2992, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 17.407865986890023, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.30859375, "learning_rate": 0.0007839218725265507, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4372914.0, "repeat_count": 0.0, "routers_loss": 0.0033313417807221413, "skip_count": 1.0, "step": 2994, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 36.0, "epoch": 17.419519300801166, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.11181640625, "learning_rate": 0.0007836025329204635, "loss": 0.0045, "macro_f1": 0.9280423521995544, "num_tokens": 4375622.0, "repeat_count": 2.0, "routers_loss": 0.012668107636272907, "skip_count": 3.0, "step": 2996, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.43117261471231, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.000783283022666934, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 4378937.0, "repeat_count": 0.0, "routers_loss": 0.005313367582857609, "skip_count": 0.0, "step": 2998, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 17.442825928623453, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.12451171875, "learning_rate": 0.0007829633419582165, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 4382475.0, "repeat_count": 1.0, "routers_loss": 0.0005645286873914301, "skip_count": 0.0, "step": 3000, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 17.454479242534596, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1884765625, "learning_rate": 0.0007826434909866667, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 4385365.0, "repeat_count": 1.0, "routers_loss": 0.0025123246014118195, "skip_count": 0.0, "step": 3002, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 17.46613255644574, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.2119140625, "learning_rate": 0.0007823234699447432, "loss": 0.0074, "macro_f1": 1.0, "num_tokens": 4388033.0, "repeat_count": 1.0, "routers_loss": 0.002948429901152849, "skip_count": 2.0, "step": 3004, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.477785870356882, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1064453125, "learning_rate": 0.0007820032790250074, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 4391258.0, "repeat_count": 0.0, "routers_loss": 0.003329492174088955, "skip_count": 0.0, "step": 3006, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 17.489439184268026, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1162109375, "learning_rate": 0.0007816829184201218, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 4394302.0, "repeat_count": 1.0, "routers_loss": 0.001583873643539846, "skip_count": 2.0, "step": 3008, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 17.50109249817917, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.298828125, "learning_rate": 0.000781362388322852, "loss": 0.0076, "macro_f1": 1.0, "num_tokens": 4396972.0, "repeat_count": 1.0, "routers_loss": 0.003610332030802965, "skip_count": 1.0, "step": 3010, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 17.512745812090312, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.146484375, "learning_rate": 0.0007810416889260653, "loss": 0.0075, "macro_f1": 0.6666666865348816, "num_tokens": 4399792.0, "repeat_count": 0.0, "routers_loss": 0.005222017411142588, "skip_count": 2.0, "step": 3012, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.524399126001455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.115234375, "learning_rate": 0.0007807208204227308, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 4402525.0, "repeat_count": 0.0, "routers_loss": 0.0022943553049117327, "skip_count": 0.0, "step": 3014, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.5360524399126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2265625, "learning_rate": 0.0007803997830059193, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 4405556.0, "repeat_count": 0.0, "routers_loss": 0.002710965694859624, "skip_count": 0.0, "step": 3016, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 39.0, "epoch": 17.547705753823745, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.259765625, "learning_rate": 0.0007800785768688035, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 4409280.0, "repeat_count": 3.0, "routers_loss": 0.0006677596247754991, "skip_count": 0.0, "step": 3018, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 17.55935906773489, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1396484375, "learning_rate": 0.0007797572022046572, "loss": 0.0071, "macro_f1": 0.6666666865348816, "num_tokens": 4412624.0, "repeat_count": 0.0, "routers_loss": 0.003747419686987996, "skip_count": 1.0, "step": 3020, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 34.0, "epoch": 17.57101238164603, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.146484375, "learning_rate": 0.0007794356592068559, "loss": 0.0043, "macro_f1": 0.9470900297164917, "num_tokens": 4415517.0, "repeat_count": 1.0, "routers_loss": 0.017081454396247864, "skip_count": 4.0, "step": 3022, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 17.582665695557175, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.0007791139480688762, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 4418158.0, "repeat_count": 1.0, "routers_loss": 0.0009213386219926178, "skip_count": 0.0, "step": 3024, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.594319009468318, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.0007787920689842964, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 4421420.0, "repeat_count": 0.0, "routers_loss": 0.0007278395351022482, "skip_count": 0.0, "step": 3026, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.60597232337946, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.130859375, "learning_rate": 0.0007784700221467952, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 4424831.0, "repeat_count": 0.0, "routers_loss": 0.0035087226424366236, "skip_count": 0.0, "step": 3028, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.617625637290605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1240234375, "learning_rate": 0.0007781478077501524, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 4427493.0, "repeat_count": 0.0, "routers_loss": 0.0009435266256332397, "skip_count": 0.0, "step": 3030, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 17.629278951201748, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.048095703125, "learning_rate": 0.0007778254259882489, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 4430152.0, "repeat_count": 0.0, "routers_loss": 0.0009707071003504097, "skip_count": 1.0, "step": 3032, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 17.64093226511289, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2451171875, "learning_rate": 0.0007775028770550661, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 4432629.0, "repeat_count": 0.0, "routers_loss": 0.0012362857814878225, "skip_count": 1.0, "step": 3034, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 17.652585579024034, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1474609375, "learning_rate": 0.0007771801611446858, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 4435464.0, "repeat_count": 0.0, "routers_loss": 0.005847550462931395, "skip_count": 2.0, "step": 3036, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 17.664238892935177, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08984375, "learning_rate": 0.0007768572784512907, "loss": 0.0035, "macro_f1": 1.0, "num_tokens": 4437884.0, "repeat_count": 1.0, "routers_loss": 0.0018905765609815717, "skip_count": 2.0, "step": 3038, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 17.67589220684632, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0888671875, "learning_rate": 0.0007765342291691636, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 4441306.0, "repeat_count": 1.0, "routers_loss": 0.0026973977219313383, "skip_count": 2.0, "step": 3040, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 35.0, "epoch": 17.687545520757464, "f1_execute": 0.9846153855323792, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.2421875, "learning_rate": 0.0007762110134926876, "loss": 0.0081, "macro_f1": 0.928205132484436, "num_tokens": 4444020.0, "repeat_count": 1.0, "routers_loss": 0.0270033348351717, "skip_count": 3.0, "step": 3042, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.699198834668607, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.0007758876316163458, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 4446956.0, "repeat_count": 0.0, "routers_loss": 0.0003837387193925679, "skip_count": 0.0, "step": 3044, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.710852148579754, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.19921875, "learning_rate": 0.0007755640837347215, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 4449796.0, "repeat_count": 0.0, "routers_loss": 0.005019268486648798, "skip_count": 0.0, "step": 3046, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 17.722505462490897, "f1_execute": 0.9824560880661011, "f1_repeat": 1.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.1220703125, "learning_rate": 0.0007752403700424978, "loss": 0.0053, "macro_f1": 0.9638490676879883, "num_tokens": 4452913.0, "repeat_count": 2.0, "routers_loss": 0.012497094459831715, "skip_count": 5.0, "step": 3048, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.73415877640204, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1083984375, "learning_rate": 0.0007749164907344575, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 4455716.0, "repeat_count": 0.0, "routers_loss": 0.0008506746380589902, "skip_count": 0.0, "step": 3050, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 17.745812090313184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0869140625, "learning_rate": 0.0007745924460054831, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 4458189.0, "repeat_count": 0.0, "routers_loss": 0.0033727267291396856, "skip_count": 2.0, "step": 3052, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.757465404224327, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.076171875, "learning_rate": 0.0007742682360505569, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 4461258.0, "repeat_count": 0.0, "routers_loss": 0.00579648744314909, "skip_count": 0.0, "step": 3054, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 17.76911871813547, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.119140625, "learning_rate": 0.00077394386106476, "loss": 0.0128, "macro_f1": 0.5507246255874634, "num_tokens": 4463833.0, "repeat_count": 0.0, "routers_loss": 0.026848729699850082, "skip_count": 2.0, "step": 3056, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 17.780772032046613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2275390625, "learning_rate": 0.0007736193212432733, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4466337.0, "repeat_count": 0.0, "routers_loss": 0.003184059401974082, "skip_count": 1.0, "step": 3058, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 17.792425345957756, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08251953125, "learning_rate": 0.0007732946167813768, "loss": 0.0076, "macro_f1": 1.0, "num_tokens": 4470388.0, "repeat_count": 1.0, "routers_loss": 0.0036660342011600733, "skip_count": 1.0, "step": 3060, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.8040786598689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1201171875, "learning_rate": 0.0007729697478744496, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 4473291.0, "repeat_count": 0.0, "routers_loss": 0.0035748269874602556, "skip_count": 0.0, "step": 3062, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 17.815731973780043, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.166015625, "learning_rate": 0.0007726447147179696, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 4475734.0, "repeat_count": 0.0, "routers_loss": 0.0006365819717757404, "skip_count": 1.0, "step": 3064, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 32.0, "epoch": 17.827385287691186, "f1_execute": 0.9677419066429138, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.1064453125, "learning_rate": 0.0007723195175075137, "loss": 0.0084, "macro_f1": 0.5892473459243774, "num_tokens": 4478521.0, "repeat_count": 0.0, "routers_loss": 0.02398105151951313, "skip_count": 6.0, "step": 3066, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 17.83903860160233, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.173828125, "learning_rate": 0.0007719941564387572, "loss": 0.006, "macro_f1": 1.0, "num_tokens": 4481322.0, "repeat_count": 1.0, "routers_loss": 0.0005484108114615083, "skip_count": 4.0, "step": 3068, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 17.850691915513472, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.171875, "learning_rate": 0.0007716686317074742, "loss": 0.0074, "macro_f1": 0.661835789680481, "num_tokens": 4484332.0, "repeat_count": 1.0, "routers_loss": 0.00908005889505148, "skip_count": 1.0, "step": 3070, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 17.86234522942462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1640625, "learning_rate": 0.0007713429435095375, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4487074.0, "repeat_count": 0.0, "routers_loss": 0.004841210786253214, "skip_count": 1.0, "step": 3072, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 17.873998543335762, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.150390625, "learning_rate": 0.000771017092040918, "loss": 0.0056, "macro_f1": 1.0, "num_tokens": 4490063.0, "repeat_count": 1.0, "routers_loss": 0.007000204641371965, "skip_count": 2.0, "step": 3074, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 33.0, "epoch": 17.885651857246906, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.244140625, "learning_rate": 0.0007706910774976848, "loss": 0.0079, "macro_f1": 0.6139194369316101, "num_tokens": 4493439.0, "repeat_count": 0.0, "routers_loss": 0.03246635943651199, "skip_count": 4.0, "step": 3076, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.89730517115805, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.0007703649000760053, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 4497547.0, "repeat_count": 0.0, "routers_loss": 0.00267224945127964, "skip_count": 0.0, "step": 3078, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 17.908958485069192, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.138671875, "learning_rate": 0.0007700385599721448, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 4500476.0, "repeat_count": 0.0, "routers_loss": 0.002994121517986059, "skip_count": 1.0, "step": 3080, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 17.920611798980335, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.07568359375, "learning_rate": 0.0007697120573824666, "loss": 0.0068, "macro_f1": 1.0, "num_tokens": 4503223.0, "repeat_count": 1.0, "routers_loss": 0.0025838182773441076, "skip_count": 2.0, "step": 3082, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 17.93226511289148, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1337890625, "learning_rate": 0.0007693853925034315, "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 4505720.0, "repeat_count": 1.0, "routers_loss": 0.004892371129244566, "skip_count": 3.0, "step": 3084, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 17.94391842680262, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.271484375, "learning_rate": 0.0007690585655315982, "loss": 0.0092, "macro_f1": 0.6666666865348816, "num_tokens": 4507963.0, "repeat_count": 1.0, "routers_loss": 0.0018665026873350143, "skip_count": 0.0, "step": 3086, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 17.955571740713765, "f1_execute": 0.9850746393203735, "f1_repeat": 0.800000011920929, "f1_skip": 0.0, "grad_norm": 0.142578125, "learning_rate": 0.0007687315766636229, "loss": 0.0078, "macro_f1": 0.5950249433517456, "num_tokens": 4510744.0, "repeat_count": 3.0, "routers_loss": 0.014644026756286621, "skip_count": 0.0, "step": 3088, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 37.0, "epoch": 17.967225054624908, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.212890625, "learning_rate": 0.0007684044260962593, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 4513579.0, "repeat_count": 2.0, "routers_loss": 0.004589893855154514, "skip_count": 1.0, "step": 3090, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 17.97887836853605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1318359375, "learning_rate": 0.0007680771140263581, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 4516103.0, "repeat_count": 0.0, "routers_loss": 0.0020069745369255543, "skip_count": 0.0, "step": 3092, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 17.990531682447195, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.111328125, "learning_rate": 0.0007677496406508673, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 4519643.0, "repeat_count": 0.0, "routers_loss": 0.003516228636726737, "skip_count": 2.0, "step": 3094, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 18.0, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.11181640625, "learning_rate": 0.0007674220061668323, "loss": 0.0066, "macro_f1": 0.5507246255874634, "num_tokens": 4521888.0, "repeat_count": 0.0, "routers_loss": 0.010026657953858376, "skip_count": 2.0, "step": 3096, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 18.011653313911143, "f1_execute": 0.9696969985961914, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.1259765625, "learning_rate": 0.0007670942107713948, "loss": 0.0071, "macro_f1": 0.5898990035057068, "num_tokens": 4524671.0, "repeat_count": 1.0, "routers_loss": 0.03651973977684975, "skip_count": 3.0, "step": 3098, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 18.023306627822286, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.177734375, "learning_rate": 0.0007667662546617938, "loss": 0.0078, "macro_f1": 0.8839138746261597, "num_tokens": 4527125.0, "repeat_count": 1.0, "routers_loss": 0.007659326773136854, "skip_count": 2.0, "step": 3100, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 18.03495994173343, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.08349609375, "learning_rate": 0.000766438138035365, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 4529796.0, "repeat_count": 1.0, "routers_loss": 0.001401754328981042, "skip_count": 0.0, "step": 3102, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 18.046613255644573, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.099609375, "learning_rate": 0.0007661098610895406, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 4534137.0, "repeat_count": 0.0, "routers_loss": 0.0028102376963943243, "skip_count": 2.0, "step": 3104, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 18.058266569555716, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0791015625, "learning_rate": 0.000765781424021849, "loss": 0.006, "macro_f1": 1.0, "num_tokens": 4537287.0, "repeat_count": 2.0, "routers_loss": 0.011731578037142754, "skip_count": 4.0, "step": 3106, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 18.06991988346686, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0927734375, "learning_rate": 0.0007654528270299155, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 4540491.0, "repeat_count": 1.0, "routers_loss": 0.006713114213198423, "skip_count": 4.0, "step": 3108, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 37.0, "epoch": 18.081573197378006, "f1_execute": 0.9846153855323792, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.130859375, "learning_rate": 0.000765124070311461, "loss": 0.0064, "macro_f1": 0.8837606906890869, "num_tokens": 4543783.0, "repeat_count": 2.0, "routers_loss": 0.027578750625252724, "skip_count": 2.0, "step": 3110, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 18.09322651128915, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.078125, "learning_rate": 0.0007647951540643027, "loss": 0.0046, "macro_f1": 0.5507246255874634, "num_tokens": 4546869.0, "repeat_count": 0.0, "routers_loss": 0.011213799007236958, "skip_count": 2.0, "step": 3112, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 18.104879825200292, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.142578125, "learning_rate": 0.0007644660784863547, "loss": 0.0076, "macro_f1": 0.8839138746261597, "num_tokens": 4549464.0, "repeat_count": 1.0, "routers_loss": 0.0118503887206316, "skip_count": 2.0, "step": 3114, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 18.116533139111436, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.10986328125, "learning_rate": 0.0007641368437756253, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 4551780.0, "repeat_count": 1.0, "routers_loss": 0.0026212066877633333, "skip_count": 0.0, "step": 3116, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 18.12818645302258, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1845703125, "learning_rate": 0.0007638074501302199, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 4554791.0, "repeat_count": 0.0, "routers_loss": 0.006907620467245579, "skip_count": 2.0, "step": 3118, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 18.139839766933722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05908203125, "learning_rate": 0.0007634778977483389, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 4557500.0, "repeat_count": 0.0, "routers_loss": 0.004418304190039635, "skip_count": 2.0, "step": 3120, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 18.151493080844865, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1650390625, "learning_rate": 0.0007631481868282784, "loss": 0.0075, "macro_f1": 1.0, "num_tokens": 4560122.0, "repeat_count": 1.0, "routers_loss": 0.00661056162789464, "skip_count": 1.0, "step": 3122, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 18.16314639475601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09033203125, "learning_rate": 0.0007628183175684298, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 4562509.0, "repeat_count": 0.0, "routers_loss": 0.006531216204166412, "skip_count": 3.0, "step": 3124, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 18.17479970866715, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.09423828125, "learning_rate": 0.00076248829016728, "loss": 0.0081, "macro_f1": 1.0, "num_tokens": 4565242.0, "repeat_count": 1.0, "routers_loss": 0.011233242228627205, "skip_count": 3.0, "step": 3126, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 18.186453022578295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.177734375, "learning_rate": 0.0007621581048234111, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 4568352.0, "repeat_count": 0.0, "routers_loss": 0.005626725498586893, "skip_count": 2.0, "step": 3128, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.198106336489438, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.103515625, "learning_rate": 0.0007618277617354996, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 4571075.0, "repeat_count": 0.0, "routers_loss": 0.0018310397863388062, "skip_count": 0.0, "step": 3130, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 18.20975965040058, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08935546875, "learning_rate": 0.0007614972611023177, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 4574072.0, "repeat_count": 0.0, "routers_loss": 0.011201203800737858, "skip_count": 1.0, "step": 3132, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 18.221412964311725, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.154296875, "learning_rate": 0.0007611666031227316, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 4577085.0, "repeat_count": 1.0, "routers_loss": 0.002454784931614995, "skip_count": 0.0, "step": 3134, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 18.23306627822287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1640625, "learning_rate": 0.0007608357879957033, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 4579575.0, "repeat_count": 0.0, "routers_loss": 0.0032074113842099905, "skip_count": 1.0, "step": 3136, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.244719592134015, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1337890625, "learning_rate": 0.0007605048159202883, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 4582082.0, "repeat_count": 0.0, "routers_loss": 0.0015993226552382112, "skip_count": 0.0, "step": 3138, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 18.256372906045158, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.09814453125, "learning_rate": 0.0007601736870956368, "loss": 0.0047, "macro_f1": 0.5507246255874634, "num_tokens": 4584932.0, "repeat_count": 0.0, "routers_loss": 0.008499354124069214, "skip_count": 1.0, "step": 3140, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 39.0, "epoch": 18.2680262199563, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0986328125, "learning_rate": 0.0007598424017209938, "loss": 0.0139, "macro_f1": 1.0, "num_tokens": 4587533.0, "repeat_count": 4.0, "routers_loss": 0.00567911472171545, "skip_count": 1.0, "step": 3142, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 18.279679533867444, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.09912109375, "learning_rate": 0.0007595109599956978, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 4590419.0, "repeat_count": 1.0, "routers_loss": 0.001263721496798098, "skip_count": 0.0, "step": 3144, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.291332847778587, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0712890625, "learning_rate": 0.0007591793621191819, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 4593210.0, "repeat_count": 0.0, "routers_loss": 0.001270989771001041, "skip_count": 0.0, "step": 3146, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 18.30298616168973, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1171875, "learning_rate": 0.0007588476082909732, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 4596242.0, "repeat_count": 0.0, "routers_loss": 0.008861967362463474, "skip_count": 6.0, "step": 3148, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 18.314639475600874, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.111328125, "learning_rate": 0.000758515698710692, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 4598670.0, "repeat_count": 5.0, "routers_loss": 0.004959133919328451, "skip_count": 8.0, "step": 3150, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 18.326292789512017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.123046875, "learning_rate": 0.0007581836335780533, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 4601480.0, "repeat_count": 0.0, "routers_loss": 0.002256895648315549, "skip_count": 2.0, "step": 3152, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.33794610342316, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07177734375, "learning_rate": 0.0007578514130928642, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 4605777.0, "repeat_count": 0.0, "routers_loss": 0.002655585529282689, "skip_count": 0.0, "step": 3154, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.349599417334304, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10205078125, "learning_rate": 0.0007575190374550272, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 4608776.0, "repeat_count": 0.0, "routers_loss": 0.0005433980841189623, "skip_count": 0.0, "step": 3156, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 18.361252731245447, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.447265625, "learning_rate": 0.0007571865068645365, "loss": 0.0075, "macro_f1": 1.0, "num_tokens": 4611746.0, "repeat_count": 1.0, "routers_loss": 0.005731430370360613, "skip_count": 1.0, "step": 3158, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 18.37290604515659, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2265625, "learning_rate": 0.0007568538215214807, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 4614352.0, "repeat_count": 0.0, "routers_loss": 0.0017417741473764181, "skip_count": 2.0, "step": 3160, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.384559359067733, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09375, "learning_rate": 0.0007565209816260407, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 4617516.0, "repeat_count": 0.0, "routers_loss": 0.0009556955774314702, "skip_count": 0.0, "step": 3162, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 18.39621267297888, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1728515625, "learning_rate": 0.0007561879873784909, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 4620345.0, "repeat_count": 0.0, "routers_loss": 0.0021337580401450396, "skip_count": 3.0, "step": 3164, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.407865986890023, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1533203125, "learning_rate": 0.0007558548389791983, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 4622740.0, "repeat_count": 0.0, "routers_loss": 0.0006596893072128296, "skip_count": 0.0, "step": 3166, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 37.0, "epoch": 18.419519300801166, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.146484375, "learning_rate": 0.0007555215366286227, "loss": 0.0069, "macro_f1": 1.0, "num_tokens": 4625525.0, "repeat_count": 3.0, "routers_loss": 0.006088367197662592, "skip_count": 2.0, "step": 3168, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 18.43117261471231, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06005859375, "learning_rate": 0.0007551880805273165, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 4629533.0, "repeat_count": 0.0, "routers_loss": 0.005842882674187422, "skip_count": 3.0, "step": 3170, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.442825928623453, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.123046875, "learning_rate": 0.0007548544708759252, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 4632338.0, "repeat_count": 0.0, "routers_loss": 0.0010054544545710087, "skip_count": 0.0, "step": 3172, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 33.0, "epoch": 18.454479242534596, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.201171875, "learning_rate": 0.0007545207078751857, "loss": 0.0082, "macro_f1": 0.6139194369316101, "num_tokens": 4635348.0, "repeat_count": 0.0, "routers_loss": 0.023459482938051224, "skip_count": 4.0, "step": 3174, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.46613255644574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.107421875, "learning_rate": 0.0007541867917259278, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 4638251.0, "repeat_count": 0.0, "routers_loss": 0.0019063291838392615, "skip_count": 0.0, "step": 3176, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.477785870356882, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10791015625, "learning_rate": 0.0007538527226290735, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 4641886.0, "repeat_count": 0.0, "routers_loss": 0.0006481746677309275, "skip_count": 0.0, "step": 3178, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.489439184268026, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11572265625, "learning_rate": 0.0007535185007856364, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 4645275.0, "repeat_count": 0.0, "routers_loss": 0.0020664785988628864, "skip_count": 0.0, "step": 3180, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 18.50109249817917, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.173828125, "learning_rate": 0.0007531841263967221, "loss": 0.0072, "macro_f1": 1.0, "num_tokens": 4647922.0, "repeat_count": 1.0, "routers_loss": 0.0005667977384291589, "skip_count": 4.0, "step": 3182, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 18.512745812090312, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1064453125, "learning_rate": 0.0007528495996635288, "loss": 0.0076, "macro_f1": 1.0, "num_tokens": 4650834.0, "repeat_count": 2.0, "routers_loss": 0.001985687529668212, "skip_count": 2.0, "step": 3184, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 18.524399126001455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08056640625, "learning_rate": 0.000752514920787345, "loss": 0.0084, "macro_f1": 0.6666666865348816, "num_tokens": 4653472.0, "repeat_count": 0.0, "routers_loss": 0.007347998674958944, "skip_count": 1.0, "step": 3186, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 18.5360524399126, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.14453125, "learning_rate": 0.0007521800899695519, "loss": 0.0061, "macro_f1": 1.0, "num_tokens": 4656317.0, "repeat_count": 1.0, "routers_loss": 0.00606652582064271, "skip_count": 1.0, "step": 3188, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 18.547705753823745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1298828125, "learning_rate": 0.0007518451074116216, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 4658849.0, "repeat_count": 0.0, "routers_loss": 0.002041024621576071, "skip_count": 1.0, "step": 3190, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 18.55935906773489, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0986328125, "learning_rate": 0.0007515099733151177, "loss": 0.0079, "macro_f1": 0.6616915464401245, "num_tokens": 4661962.0, "repeat_count": 1.0, "routers_loss": 0.02985851652920246, "skip_count": 2.0, "step": 3192, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 33.0, "epoch": 18.57101238164603, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.1328125, "learning_rate": 0.0007511746878816944, "loss": 0.0064, "macro_f1": 0.6139194369316101, "num_tokens": 4664516.0, "repeat_count": 0.0, "routers_loss": 0.023361852392554283, "skip_count": 4.0, "step": 3194, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.8333333134651184, "avg_layers": 31.0, "epoch": 18.582665695557175, "f1_execute": 0.9836065173149109, "f1_repeat": 0.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.08984375, "learning_rate": 0.0007508392513130979, "loss": 0.0042, "macro_f1": 0.63089919090271, "num_tokens": 4666983.0, "repeat_count": 0.0, "routers_loss": 0.011119939386844635, "skip_count": 6.0, "step": 3196, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 18.594319009468318, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1552734375, "learning_rate": 0.0007505036638111648, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 4669948.0, "repeat_count": 0.0, "routers_loss": 0.0029009091667830944, "skip_count": 1.0, "step": 3198, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 18.60597232337946, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.06494140625, "learning_rate": 0.0007501679255778224, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 4673532.0, "repeat_count": 1.0, "routers_loss": 0.0005598510033451021, "skip_count": 0.0, "step": 3200, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 18.617625637290605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08251953125, "learning_rate": 0.0007498320368150891, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 4676752.0, "repeat_count": 0.0, "routers_loss": 0.0011387091362848878, "skip_count": 3.0, "step": 3202, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 18.629278951201748, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.17578125, "learning_rate": 0.0007494959977250738, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 4680042.0, "repeat_count": 0.0, "routers_loss": 0.0007510576397180557, "skip_count": 2.0, "step": 3204, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 18.64093226511289, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.126953125, "learning_rate": 0.0007491598085099752, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 4683142.0, "repeat_count": 0.0, "routers_loss": 0.0005528829642571509, "skip_count": 1.0, "step": 3206, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.652585579024034, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0771484375, "learning_rate": 0.0007488234693720832, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 4685675.0, "repeat_count": 0.0, "routers_loss": 0.0033823202829807997, "skip_count": 0.0, "step": 3208, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.664238892935177, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11181640625, "learning_rate": 0.0007484869805137777, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 4687961.0, "repeat_count": 0.0, "routers_loss": 0.002773819724097848, "skip_count": 0.0, "step": 3210, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 18.67589220684632, "f1_execute": 0.9841269850730896, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.1025390625, "learning_rate": 0.0007481503421375282, "loss": 0.0057, "macro_f1": 0.9280423521995544, "num_tokens": 4691086.0, "repeat_count": 3.0, "routers_loss": 0.026026049628853798, "skip_count": 2.0, "step": 3212, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.687545520757464, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0849609375, "learning_rate": 0.0007478135544458949, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 4694328.0, "repeat_count": 0.0, "routers_loss": 0.0010268461192026734, "skip_count": 0.0, "step": 3214, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 18.699198834668607, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.0007474766176415271, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 4696889.0, "repeat_count": 1.0, "routers_loss": 0.0010767681524157524, "skip_count": 0.0, "step": 3216, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 18.710852148579754, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1328125, "learning_rate": 0.0007471395319271644, "loss": 0.0065, "macro_f1": 0.6616915464401245, "num_tokens": 4699329.0, "repeat_count": 2.0, "routers_loss": 0.01630355790257454, "skip_count": 1.0, "step": 3218, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.722505462490897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1728515625, "learning_rate": 0.0007468022975056357, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 4703448.0, "repeat_count": 0.0, "routers_loss": 0.0005067543825134635, "skip_count": 0.0, "step": 3220, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 18.73415877640204, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.119140625, "learning_rate": 0.0007464649145798596, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 4706318.0, "repeat_count": 0.0, "routers_loss": 0.0007291951915249228, "skip_count": 1.0, "step": 3222, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 18.745812090313184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.193359375, "learning_rate": 0.0007461273833528438, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 4708882.0, "repeat_count": 0.0, "routers_loss": 0.0013625886058434844, "skip_count": 2.0, "step": 3224, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 40.0, "epoch": 18.757465404224327, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0947265625, "learning_rate": 0.0007457897040276853, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 4711629.0, "repeat_count": 4.0, "routers_loss": 0.0015061062294989824, "skip_count": 0.0, "step": 3226, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.76911871813547, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2109375, "learning_rate": 0.0007454518768075704, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 4714439.0, "repeat_count": 0.0, "routers_loss": 0.0039747669361531734, "skip_count": 0.0, "step": 3228, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 18.780772032046613, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.2119140625, "learning_rate": 0.0007451139018957743, "loss": 0.0058, "macro_f1": 0.5950249433517456, "num_tokens": 4717104.0, "repeat_count": 0.0, "routers_loss": 0.015309740789234638, "skip_count": 3.0, "step": 3230, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.792425345957756, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.123046875, "learning_rate": 0.0007447757794956609, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 4720853.0, "repeat_count": 0.0, "routers_loss": 0.0009922216413542628, "skip_count": 0.0, "step": 3232, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 18.8040786598689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.146484375, "learning_rate": 0.0007444375098106831, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 4723748.0, "repeat_count": 0.0, "routers_loss": 0.009861153550446033, "skip_count": 1.0, "step": 3234, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.815731973780043, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1923828125, "learning_rate": 0.000744099093044382, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 4726519.0, "repeat_count": 0.0, "routers_loss": 0.0012597906170412898, "skip_count": 0.0, "step": 3236, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 18.827385287691186, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.169921875, "learning_rate": 0.0007437605294003881, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 4729307.0, "repeat_count": 0.0, "routers_loss": 0.0061767022125422955, "skip_count": 2.0, "step": 3238, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.83903860160233, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0732421875, "learning_rate": 0.000743421819082419, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 4732151.0, "repeat_count": 0.0, "routers_loss": 0.003332817694172263, "skip_count": 0.0, "step": 3240, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.8333333134651184, "avg_layers": 31.0, "epoch": 18.850691915513472, "f1_execute": 0.9836065173149109, "f1_repeat": 0.0, "f1_skip": 0.9090909361839294, "grad_norm": 0.1201171875, "learning_rate": 0.0007430829622942816, "loss": 0.0076, "macro_f1": 0.63089919090271, "num_tokens": 4735565.0, "repeat_count": 0.0, "routers_loss": 0.015167932026088238, "skip_count": 6.0, "step": 3242, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.86234522942462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1103515625, "learning_rate": 0.0007427439592398706, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 4738562.0, "repeat_count": 0.0, "routers_loss": 0.0014353095320984721, "skip_count": 0.0, "step": 3244, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 18.873998543335762, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.11669921875, "learning_rate": 0.0007424048101231686, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 4741585.0, "repeat_count": 1.0, "routers_loss": 0.0019964331295341253, "skip_count": 0.0, "step": 3246, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.885651857246906, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12255859375, "learning_rate": 0.0007420655151482461, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 4744593.0, "repeat_count": 0.0, "routers_loss": 0.0012160304468125105, "skip_count": 0.0, "step": 3248, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.89730517115805, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1259765625, "learning_rate": 0.0007417260745192615, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 4747761.0, "repeat_count": 0.0, "routers_loss": 0.0024314275942742825, "skip_count": 0.0, "step": 3250, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.800000011920929, "avg_layers": 33.0, "epoch": 18.908958485069192, "f1_execute": 0.9836065173149109, "f1_repeat": 1.0, "f1_skip": 0.888888955116272, "grad_norm": 0.138671875, "learning_rate": 0.0007413864884404607, "loss": 0.0046, "macro_f1": 0.9574984908103943, "num_tokens": 4750840.0, "repeat_count": 1.0, "routers_loss": 0.017925940454006195, "skip_count": 5.0, "step": 3252, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1396484375, "learning_rate": 0.0007410467571161772, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 4753622.0, "repeat_count": 0.0, "routers_loss": 0.0007057868060655892, "skip_count": 0.0, "step": 3254, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 18.93226511289148, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10107421875, "learning_rate": 0.0007407068807508321, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 4756921.0, "repeat_count": 0.0, "routers_loss": 0.0015349991153925657, "skip_count": 1.0, "step": 3256, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 18.94391842680262, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.138671875, "learning_rate": 0.0007403668595489332, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 4760013.0, "repeat_count": 0.0, "routers_loss": 0.0032711774110794067, "skip_count": 2.0, "step": 3258, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 18.955571740713765, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1630859375, "learning_rate": 0.0007400266937150761, "loss": 0.0098, "macro_f1": 0.6666666865348816, "num_tokens": 4762859.0, "repeat_count": 0.0, "routers_loss": 0.00689981784671545, "skip_count": 1.0, "step": 3260, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.967225054624908, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.205078125, "learning_rate": 0.0007396863834539431, "loss": 0.0102, "macro_f1": 0.32863849401474, "num_tokens": 4765375.0, "repeat_count": 0.0, "routers_loss": 0.017660168930888176, "skip_count": 1.0, "step": 3262, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 18.97887836853605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.177734375, "learning_rate": 0.0007393459289703035, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 4768244.0, "repeat_count": 0.0, "routers_loss": 0.007000640965998173, "skip_count": 0.0, "step": 3264, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 18.990531682447195, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.15234375, "learning_rate": 0.0007390053304690131, "loss": 0.0077, "macro_f1": 0.6616915464401245, "num_tokens": 4771117.0, "repeat_count": 1.0, "routers_loss": 0.029118994250893593, "skip_count": 2.0, "step": 3266, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 19.0, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.0007386645881550145, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 4773104.0, "repeat_count": 1.0, "routers_loss": 0.0016860353061929345, "skip_count": 0.0, "step": 3268, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 19.011653313911143, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.06591796875, "learning_rate": 0.0007383237022333374, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 4776759.0, "repeat_count": 1.0, "routers_loss": 0.0012642483925446868, "skip_count": 0.0, "step": 3270, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.023306627822286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1796875, "learning_rate": 0.0007379826729090968, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 4780322.0, "repeat_count": 0.0, "routers_loss": 0.002168776234611869, "skip_count": 0.0, "step": 3272, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 19.03495994173343, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.099609375, "learning_rate": 0.000737641500387495, "loss": 0.0036, "macro_f1": 1.0, "num_tokens": 4783034.0, "repeat_count": 2.0, "routers_loss": 0.0017636904958635569, "skip_count": 2.0, "step": 3274, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.046613255644573, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07568359375, "learning_rate": 0.0007373001848738202, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 4785752.0, "repeat_count": 0.0, "routers_loss": 0.0025314681697636843, "skip_count": 0.0, "step": 3276, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 33.0, "epoch": 19.058266569555716, "f1_execute": 0.9696969985961914, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1572265625, "learning_rate": 0.0007369587265734463, "loss": 0.0061, "macro_f1": 0.5454546213150024, "num_tokens": 4788851.0, "repeat_count": 0.0, "routers_loss": 0.019259991124272346, "skip_count": 3.0, "step": 3278, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.06991988346686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.23046875, "learning_rate": 0.0007366171256918334, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 4791727.0, "repeat_count": 0.0, "routers_loss": 0.0009089522063732147, "skip_count": 0.0, "step": 3280, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 19.081573197378006, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.404296875, "learning_rate": 0.0007362753824345271, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 4794239.0, "repeat_count": 0.0, "routers_loss": 0.0039035894442349672, "skip_count": 3.0, "step": 3282, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 19.09322651128915, "f1_execute": 0.9830508232116699, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 0.228515625, "learning_rate": 0.0007359334970071591, "loss": 0.0067, "macro_f1": 0.9276835918426514, "num_tokens": 4797777.0, "repeat_count": 3.0, "routers_loss": 0.012932924553751945, "skip_count": 4.0, "step": 3284, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 19.104879825200292, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.166015625, "learning_rate": 0.0007355914696154464, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 4800310.0, "repeat_count": 1.0, "routers_loss": 0.004200110211968422, "skip_count": 2.0, "step": 3286, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.116533139111436, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.123046875, "learning_rate": 0.0007352493004651916, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 4803803.0, "repeat_count": 0.0, "routers_loss": 0.0016204684507101774, "skip_count": 0.0, "step": 3288, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 19.12818645302258, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1474609375, "learning_rate": 0.0007349069897622822, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 4806623.0, "repeat_count": 0.0, "routers_loss": 0.008925621397793293, "skip_count": 2.0, "step": 3290, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 19.139839766933722, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0693359375, "learning_rate": 0.0007345645377126915, "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 4809751.0, "repeat_count": 1.0, "routers_loss": 0.0012932125246152282, "skip_count": 1.0, "step": 3292, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.151493080844865, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1220703125, "learning_rate": 0.000734221944522477, "loss": 0.0068, "macro_f1": 0.3333333432674408, "num_tokens": 4812520.0, "repeat_count": 0.0, "routers_loss": 0.0036725299432873726, "skip_count": 0.0, "step": 3294, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 19.16314639475601, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.09326171875, "learning_rate": 0.0007338792103977821, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 4815608.0, "repeat_count": 1.0, "routers_loss": 0.001730661722831428, "skip_count": 0.0, "step": 3296, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 19.17479970866715, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0810546875, "learning_rate": 0.0007335363355448341, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 4818032.0, "repeat_count": 0.0, "routers_loss": 0.0016334244282916188, "skip_count": 2.0, "step": 3298, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.186453022578295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08203125, "learning_rate": 0.0007331933201699457, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 4820574.0, "repeat_count": 0.0, "routers_loss": 0.0006953898700885475, "skip_count": 0.0, "step": 3300, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.198106336489438, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.099609375, "learning_rate": 0.0007328501644795136, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 4823833.0, "repeat_count": 0.0, "routers_loss": 0.0015705337282270193, "skip_count": 0.0, "step": 3302, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 35.0, "epoch": 19.20975965040058, "f1_execute": 0.9846153855323792, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.09521484375, "learning_rate": 0.0007325068686800194, "loss": 0.0053, "macro_f1": 0.928205132484436, "num_tokens": 4827034.0, "repeat_count": 1.0, "routers_loss": 0.013484926894307137, "skip_count": 3.0, "step": 3304, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 36.0, "epoch": 19.221412964311725, "f1_execute": 0.9629629254341125, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.126953125, "learning_rate": 0.0007321634329780285, "loss": 0.0057, "macro_f1": 0.920987606048584, "num_tokens": 4829608.0, "repeat_count": 4.0, "routers_loss": 0.025853270664811134, "skip_count": 6.0, "step": 3306, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.23306627822287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.054443359375, "learning_rate": 0.0007318198575801911, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 4833439.0, "repeat_count": 0.0, "routers_loss": 0.001796884462237358, "skip_count": 0.0, "step": 3308, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 19.244719592134015, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1103515625, "learning_rate": 0.0007314761426932408, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 4836529.0, "repeat_count": 0.0, "routers_loss": 0.0035893742460757494, "skip_count": 1.0, "step": 3310, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 19.256372906045158, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07373046875, "learning_rate": 0.0007311322885239957, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 4839361.0, "repeat_count": 0.0, "routers_loss": 0.006187157705426216, "skip_count": 1.0, "step": 3312, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 19.2680262199563, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.05078125, "learning_rate": 0.0007307882952793574, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 4842855.0, "repeat_count": 1.0, "routers_loss": 0.0020316634327173233, "skip_count": 1.0, "step": 3314, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.279679533867444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1064453125, "learning_rate": 0.000730444163166311, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 4845388.0, "repeat_count": 0.0, "routers_loss": 0.001022032112814486, "skip_count": 0.0, "step": 3316, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 19.291332847778587, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1298828125, "learning_rate": 0.0007300998923919259, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 4848301.0, "repeat_count": 1.0, "routers_loss": 0.005321778357028961, "skip_count": 0.0, "step": 3318, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.30298616168973, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.140625, "learning_rate": 0.000729755483163354, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 4852019.0, "repeat_count": 0.0, "routers_loss": 0.003971325699239969, "skip_count": 0.0, "step": 3320, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 19.314639475600874, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.349609375, "learning_rate": 0.000729410935687831, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 4854725.0, "repeat_count": 1.0, "routers_loss": 0.0017381304642185569, "skip_count": 0.0, "step": 3322, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 37.0, "epoch": 19.326292789512017, "f1_execute": 0.9836065173149109, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.12890625, "learning_rate": 0.0007290662501726759, "loss": 0.0064, "macro_f1": 0.9278688430786133, "num_tokens": 4857724.0, "repeat_count": 3.0, "routers_loss": 0.01989312656223774, "skip_count": 3.0, "step": 3324, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.33794610342316, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.0007287214268252904, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 4861323.0, "repeat_count": 0.0, "routers_loss": 0.0014093025820329785, "skip_count": 0.0, "step": 3326, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 19.349599417334304, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0966796875, "learning_rate": 0.0007283764658531595, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 4864395.0, "repeat_count": 0.0, "routers_loss": 0.0033239838667213917, "skip_count": 2.0, "step": 3328, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 19.361252731245447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08203125, "learning_rate": 0.0007280313674638508, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 4867138.0, "repeat_count": 0.0, "routers_loss": 0.0023634331300854683, "skip_count": 3.0, "step": 3330, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 19.37290604515659, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1220703125, "learning_rate": 0.0007276861318650145, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 4870089.0, "repeat_count": 0.0, "routers_loss": 0.001917628338560462, "skip_count": 1.0, "step": 3332, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 19.384559359067733, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0869140625, "learning_rate": 0.0007273407592643836, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 4872945.0, "repeat_count": 0.0, "routers_loss": 0.0020468488801270723, "skip_count": 3.0, "step": 3334, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 19.39621267297888, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09375, "learning_rate": 0.0007269952498697733, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 4876038.0, "repeat_count": 0.0, "routers_loss": 0.005730155389755964, "skip_count": 1.0, "step": 3336, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 19.407865986890023, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08984375, "learning_rate": 0.0007266496038890815, "loss": 0.0042, "macro_f1": 1.0, "num_tokens": 4879063.0, "repeat_count": 1.0, "routers_loss": 0.004384960513561964, "skip_count": 4.0, "step": 3338, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 19.419519300801166, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1064453125, "learning_rate": 0.0007263038215302878, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 4881911.0, "repeat_count": 1.0, "routers_loss": 0.00197159918025136, "skip_count": 0.0, "step": 3340, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 19.43117261471231, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.09814453125, "learning_rate": 0.000725957903001454, "loss": 0.0043, "macro_f1": 0.661835789680481, "num_tokens": 4884601.0, "repeat_count": 1.0, "routers_loss": 0.012392993085086346, "skip_count": 1.0, "step": 3342, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.442825928623453, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10791015625, "learning_rate": 0.0007256118485107242, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 4887417.0, "repeat_count": 0.0, "routers_loss": 0.0024973975960165262, "skip_count": 0.0, "step": 3344, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 19.454479242534596, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.171875, "learning_rate": 0.0007252656582663236, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 4890543.0, "repeat_count": 1.0, "routers_loss": 0.000879391620401293, "skip_count": 0.0, "step": 3346, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.46613255644574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1103515625, "learning_rate": 0.0007249193324765598, "loss": 0.0082, "macro_f1": 0.3333333432674408, "num_tokens": 4893425.0, "repeat_count": 0.0, "routers_loss": 0.006613228470087051, "skip_count": 0.0, "step": 3348, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.477785870356882, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1630859375, "learning_rate": 0.0007245728713498219, "loss": 0.0066, "macro_f1": 0.32863849401474, "num_tokens": 4896160.0, "repeat_count": 0.0, "routers_loss": 0.013910067267715931, "skip_count": 1.0, "step": 3350, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 19.489439184268026, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.25, "learning_rate": 0.0007242262750945796, "loss": 0.0065, "macro_f1": 0.8839138746261597, "num_tokens": 4899269.0, "repeat_count": 1.0, "routers_loss": 0.01171480305492878, "skip_count": 2.0, "step": 3352, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.50109249817917, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0849609375, "learning_rate": 0.0007238795439193849, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 4902519.0, "repeat_count": 0.0, "routers_loss": 0.0008653972763568163, "skip_count": 0.0, "step": 3354, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.512745812090312, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11376953125, "learning_rate": 0.0007235326780328706, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 4905111.0, "repeat_count": 0.0, "routers_loss": 0.002062977757304907, "skip_count": 0.0, "step": 3356, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 19.524399126001455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.12451171875, "learning_rate": 0.0007231856776437502, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 4908323.0, "repeat_count": 0.0, "routers_loss": 0.006798225454986095, "skip_count": 3.0, "step": 3358, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 19.5360524399126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1005859375, "learning_rate": 0.000722838542960819, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 4910834.0, "repeat_count": 0.0, "routers_loss": 0.005453518591821194, "skip_count": 2.0, "step": 3360, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.547705753823745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1416015625, "learning_rate": 0.0007224912741929522, "loss": 0.008, "macro_f1": 0.3333333432674408, "num_tokens": 4913813.0, "repeat_count": 0.0, "routers_loss": 0.003534254152327776, "skip_count": 0.0, "step": 3362, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.55935906773489, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.146484375, "learning_rate": 0.0007221438715491063, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 4916470.0, "repeat_count": 0.0, "routers_loss": 0.0012371462071314454, "skip_count": 0.0, "step": 3364, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 19.57101238164603, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.126953125, "learning_rate": 0.0007217963352383181, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 4919127.0, "repeat_count": 1.0, "routers_loss": 0.0058456952683627605, "skip_count": 4.0, "step": 3366, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.582665695557175, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1826171875, "learning_rate": 0.0007214486654697046, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 4921664.0, "repeat_count": 0.0, "routers_loss": 0.004235987085849047, "skip_count": 0.0, "step": 3368, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 19.594319009468318, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.146484375, "learning_rate": 0.0007211008624524635, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 4924389.0, "repeat_count": 1.0, "routers_loss": 0.005105081479996443, "skip_count": 0.0, "step": 3370, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 19.60597232337946, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10693359375, "learning_rate": 0.0007207529263958726, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 4928017.0, "repeat_count": 0.0, "routers_loss": 0.004433147609233856, "skip_count": 2.0, "step": 3372, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.617625637290605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12158203125, "learning_rate": 0.0007204048575092896, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 4930360.0, "repeat_count": 0.0, "routers_loss": 0.0018072854727506638, "skip_count": 0.0, "step": 3374, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 19.629278951201748, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1796875, "learning_rate": 0.0007200566560021524, "loss": 0.0048, "macro_f1": 0.8839138746261597, "num_tokens": 4933002.0, "repeat_count": 1.0, "routers_loss": 0.019089223816990852, "skip_count": 2.0, "step": 3376, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.64093226511289, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.23828125, "learning_rate": 0.0007197083220839784, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 4935582.0, "repeat_count": 0.0, "routers_loss": 0.0013629193417727947, "skip_count": 0.0, "step": 3378, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 19.652585579024034, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1376953125, "learning_rate": 0.0007193598559643647, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 4938374.0, "repeat_count": 0.0, "routers_loss": 0.0020992171484977007, "skip_count": 1.0, "step": 3380, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 39.0, "epoch": 19.664238892935177, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08984375, "learning_rate": 0.000719011257852988, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 4941289.0, "repeat_count": 5.0, "routers_loss": 0.0013210212346166372, "skip_count": 2.0, "step": 3382, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 19.67589220684632, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.11865234375, "learning_rate": 0.0007186625279596044, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 4944115.0, "repeat_count": 1.0, "routers_loss": 0.003343864344060421, "skip_count": 1.0, "step": 3384, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.687545520757464, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11865234375, "learning_rate": 0.0007183136664940498, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 4946558.0, "repeat_count": 0.0, "routers_loss": 0.0013231178745627403, "skip_count": 0.0, "step": 3386, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 19.699198834668607, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10498046875, "learning_rate": 0.0007179646736662382, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 4949312.0, "repeat_count": 0.0, "routers_loss": 0.0026451987214386463, "skip_count": 1.0, "step": 3388, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.710852148579754, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.111328125, "learning_rate": 0.0007176155496861638, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 4952727.0, "repeat_count": 0.0, "routers_loss": 0.0026686282362788916, "skip_count": 0.0, "step": 3390, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 19.722505462490897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11474609375, "learning_rate": 0.0007172662947638988, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 4955294.0, "repeat_count": 0.0, "routers_loss": 0.006303757429122925, "skip_count": 3.0, "step": 3392, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 19.73415877640204, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09912109375, "learning_rate": 0.0007169169091095948, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 4958501.0, "repeat_count": 0.0, "routers_loss": 0.010137793608009815, "skip_count": 2.0, "step": 3394, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 19.745812090313184, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.134765625, "learning_rate": 0.0007165673929334815, "loss": 0.008, "macro_f1": 1.0, "num_tokens": 4961254.0, "repeat_count": 1.0, "routers_loss": 0.00535039184615016, "skip_count": 4.0, "step": 3396, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.757465404224327, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08740234375, "learning_rate": 0.0007162177464458678, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 4964126.0, "repeat_count": 0.0, "routers_loss": 0.002991702873259783, "skip_count": 0.0, "step": 3398, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.76911871813547, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09228515625, "learning_rate": 0.0007158679698571407, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 4966806.0, "repeat_count": 0.0, "routers_loss": 0.0023024086840450764, "skip_count": 0.0, "step": 3400, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 33.0, "epoch": 19.780772032046613, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.1181640625, "learning_rate": 0.000715518063377765, "loss": 0.0065, "macro_f1": 0.6139194369316101, "num_tokens": 4969309.0, "repeat_count": 0.0, "routers_loss": 0.02125658467411995, "skip_count": 4.0, "step": 3402, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 19.792425345957756, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1416015625, "learning_rate": 0.0007151680272182847, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 4971969.0, "repeat_count": 0.0, "routers_loss": 0.0025149318389594555, "skip_count": 1.0, "step": 3404, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 19.8040786598689, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.177734375, "learning_rate": 0.0007148178615893208, "loss": 0.0048, "macro_f1": 0.5507246255874634, "num_tokens": 4974414.0, "repeat_count": 0.0, "routers_loss": 0.008116615936160088, "skip_count": 2.0, "step": 3406, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 19.815731973780043, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.080078125, "learning_rate": 0.000714467566701573, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 4977460.0, "repeat_count": 1.0, "routers_loss": 0.004420279525220394, "skip_count": 0.0, "step": 3408, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 19.827385287691186, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.08642578125, "learning_rate": 0.0007141171427658182, "loss": 0.006, "macro_f1": 0.5507246255874634, "num_tokens": 4980401.0, "repeat_count": 0.0, "routers_loss": 0.006903476547449827, "skip_count": 2.0, "step": 3410, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.83903860160233, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1337890625, "learning_rate": 0.0007137665899929111, "loss": 0.0115, "macro_f1": 0.3333333432674408, "num_tokens": 4982957.0, "repeat_count": 0.0, "routers_loss": 0.0012852373765781522, "skip_count": 0.0, "step": 3412, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 19.850691915513472, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1689453125, "learning_rate": 0.0007134159085937841, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 4986316.0, "repeat_count": 0.0, "routers_loss": 0.003480031620711088, "skip_count": 1.0, "step": 3414, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 19.86234522942462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.12255859375, "learning_rate": 0.000713065098779447, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 4988967.0, "repeat_count": 0.0, "routers_loss": 0.001699127722531557, "skip_count": 2.0, "step": 3416, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.873998543335762, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1455078125, "learning_rate": 0.0007127141607609867, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 4991835.0, "repeat_count": 0.0, "routers_loss": 0.0016233375063166022, "skip_count": 0.0, "step": 3418, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 19.885651857246906, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.08154296875, "learning_rate": 0.0007123630947495672, "loss": 0.0066, "macro_f1": 0.661835789680481, "num_tokens": 4994691.0, "repeat_count": 1.0, "routers_loss": 0.007585971150547266, "skip_count": 1.0, "step": 3420, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 19.89730517115805, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2470703125, "learning_rate": 0.0007120119009564298, "loss": 0.0085, "macro_f1": 0.6666666865348816, "num_tokens": 4998592.0, "repeat_count": 0.0, "routers_loss": 0.0029182215221226215, "skip_count": 2.0, "step": 3422, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.908958485069192, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.072265625, "learning_rate": 0.0007116605795928925, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 5002381.0, "repeat_count": 0.0, "routers_loss": 0.002327726222574711, "skip_count": 0.0, "step": 3424, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.0007113091308703497, "loss": 0.0065, "macro_f1": 0.3333333432674408, "num_tokens": 5004949.0, "repeat_count": 0.0, "routers_loss": 0.0017768351826816797, "skip_count": 0.0, "step": 3426, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 19.93226511289148, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09912109375, "learning_rate": 0.0007109575550002733, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5007720.0, "repeat_count": 0.0, "routers_loss": 0.005772108677774668, "skip_count": 2.0, "step": 3428, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 19.94391842680262, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1171875, "learning_rate": 0.0007106058521942109, "loss": 0.0057, "macro_f1": 1.0, "num_tokens": 5011276.0, "repeat_count": 3.0, "routers_loss": 0.005547509994357824, "skip_count": 6.0, "step": 3430, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.955571740713765, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.0007102540226637869, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5013864.0, "repeat_count": 0.0, "routers_loss": 0.0013262206921353936, "skip_count": 0.0, "step": 3432, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 19.967225054624908, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1689453125, "learning_rate": 0.0007099020666207018, "loss": 0.0065, "macro_f1": 0.661835789680481, "num_tokens": 5016845.0, "repeat_count": 1.0, "routers_loss": 0.01161392591893673, "skip_count": 1.0, "step": 3434, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 19.97887836853605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.087890625, "learning_rate": 0.0007095499842767323, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 5019466.0, "repeat_count": 0.0, "routers_loss": 0.004726933315396309, "skip_count": 1.0, "step": 3436, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 19.990531682447195, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10693359375, "learning_rate": 0.0007091977758437311, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 5021982.0, "repeat_count": 0.0, "routers_loss": 0.0019450897816568613, "skip_count": 0.0, "step": 3438, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0751953125, "learning_rate": 0.0007088454415336265, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 5024320.0, "repeat_count": 0.0, "routers_loss": 0.003659927984699607, "skip_count": 0.0, "step": 3440, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 20.011653313911143, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.103515625, "learning_rate": 0.0007084929815584231, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 5027326.0, "repeat_count": 0.0, "routers_loss": 0.0011477689258754253, "skip_count": 2.0, "step": 3442, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 20.023306627822286, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.06640625, "learning_rate": 0.0007081403961302007, "loss": 0.0037, "macro_f1": 1.0, "num_tokens": 5030451.0, "repeat_count": 1.0, "routers_loss": 0.002225241158157587, "skip_count": 3.0, "step": 3444, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 20.03495994173343, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11669921875, "learning_rate": 0.0007077876854611145, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 5033401.0, "repeat_count": 0.0, "routers_loss": 0.0057023377157747746, "skip_count": 2.0, "step": 3446, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 20.046613255644573, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.111328125, "learning_rate": 0.0007074348497633952, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 5036104.0, "repeat_count": 0.0, "routers_loss": 0.007414532359689474, "skip_count": 1.0, "step": 3448, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 20.058266569555716, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.08740234375, "learning_rate": 0.0007070818892493491, "loss": 0.0037, "macro_f1": 0.6666666865348816, "num_tokens": 5038827.0, "repeat_count": 1.0, "routers_loss": 0.0015464614843949676, "skip_count": 0.0, "step": 3450, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.06991988346686, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09765625, "learning_rate": 0.0007067288041313571, "loss": 0.0065, "macro_f1": 0.32863849401474, "num_tokens": 5041300.0, "repeat_count": 0.0, "routers_loss": 0.01192833948880434, "skip_count": 1.0, "step": 3452, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 20.081573197378006, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.171875, "learning_rate": 0.0007063755946218751, "loss": 0.0043, "macro_f1": 1.0, "num_tokens": 5044318.0, "repeat_count": 1.0, "routers_loss": 0.002514195628464222, "skip_count": 1.0, "step": 3454, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.09322651128915, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.14453125, "learning_rate": 0.0007060222609334342, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 5047374.0, "repeat_count": 0.0, "routers_loss": 0.0008756173774600029, "skip_count": 0.0, "step": 3456, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 20.104879825200292, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.09814453125, "learning_rate": 0.0007056688032786398, "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 5050811.0, "repeat_count": 1.0, "routers_loss": 0.00244494853541255, "skip_count": 2.0, "step": 3458, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 20.116533139111436, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1142578125, "learning_rate": 0.0007053152218701724, "loss": 0.005, "macro_f1": 1.0, "num_tokens": 5053400.0, "repeat_count": 1.0, "routers_loss": 0.004801429342478514, "skip_count": 1.0, "step": 3460, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 20.12818645302258, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07177734375, "learning_rate": 0.0007049615169207864, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 5056339.0, "repeat_count": 0.0, "routers_loss": 0.008154598996043205, "skip_count": 3.0, "step": 3462, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 20.139839766933722, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.076171875, "learning_rate": 0.0007046076886433109, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 5059972.0, "repeat_count": 1.0, "routers_loss": 0.003514054697006941, "skip_count": 1.0, "step": 3464, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.151493080844865, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.126953125, "learning_rate": 0.0007042537372506493, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 5062874.0, "repeat_count": 0.0, "routers_loss": 0.005593776702880859, "skip_count": 0.0, "step": 3466, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 20.16314639475601, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.06396484375, "learning_rate": 0.0007038996629557783, "loss": 0.0046, "macro_f1": 0.661835789680481, "num_tokens": 5065932.0, "repeat_count": 1.0, "routers_loss": 0.014209412969648838, "skip_count": 1.0, "step": 3468, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.17479970866715, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1259765625, "learning_rate": 0.00070354546597175, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5068508.0, "repeat_count": 0.0, "routers_loss": 0.0008326509851031005, "skip_count": 0.0, "step": 3470, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 20.186453022578295, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1826171875, "learning_rate": 0.0007031911465116887, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 5071035.0, "repeat_count": 1.0, "routers_loss": 0.002910683862864971, "skip_count": 3.0, "step": 3472, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 20.198106336489438, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.07958984375, "learning_rate": 0.0007028367047887935, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 5074450.0, "repeat_count": 1.0, "routers_loss": 0.0026570719201117754, "skip_count": 0.0, "step": 3474, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 20.20975965040058, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.11376953125, "learning_rate": 0.0007024821410163368, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 5077151.0, "repeat_count": 1.0, "routers_loss": 0.010277781635522842, "skip_count": 4.0, "step": 3476, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 34.0, "epoch": 20.221412964311725, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.16796875, "learning_rate": 0.0007021274554076644, "loss": 0.0052, "macro_f1": 0.9470900297164917, "num_tokens": 5079929.0, "repeat_count": 1.0, "routers_loss": 0.0070365965366363525, "skip_count": 4.0, "step": 3478, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 20.23306627822287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.181640625, "learning_rate": 0.0007017726481761951, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 5082308.0, "repeat_count": 0.0, "routers_loss": 0.002784114796668291, "skip_count": 5.0, "step": 3480, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.244719592134015, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.103515625, "learning_rate": 0.0007014177195354213, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 5085107.0, "repeat_count": 0.0, "routers_loss": 0.000821599387563765, "skip_count": 0.0, "step": 3482, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 20.256372906045158, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.07421875, "learning_rate": 0.0007010626696989085, "loss": 0.0034, "macro_f1": 1.0, "num_tokens": 5087705.0, "repeat_count": 1.0, "routers_loss": 0.007924535311758518, "skip_count": 4.0, "step": 3484, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 20.2680262199563, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10107421875, "learning_rate": 0.0007007074988802946, "loss": 0.0064, "macro_f1": 1.0, "num_tokens": 5090755.0, "repeat_count": 2.0, "routers_loss": 0.007092305459082127, "skip_count": 3.0, "step": 3486, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 20.279679533867444, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.06689453125, "learning_rate": 0.0007003522072932908, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 5093642.0, "repeat_count": 1.0, "routers_loss": 0.0015567619120702147, "skip_count": 0.0, "step": 3488, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 20.291332847778587, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.12060546875, "learning_rate": 0.0006999967951516811, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 5096983.0, "repeat_count": 1.0, "routers_loss": 0.0013171395985409617, "skip_count": 2.0, "step": 3490, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 20.30298616168973, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.06494140625, "learning_rate": 0.0006996412626693214, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 5100264.0, "repeat_count": 1.0, "routers_loss": 0.0008490347536280751, "skip_count": 0.0, "step": 3492, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 20.314639475600874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.138671875, "learning_rate": 0.0006992856100601403, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 5103124.0, "repeat_count": 0.0, "routers_loss": 0.009042516350746155, "skip_count": 2.0, "step": 3494, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.326292789512017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09228515625, "learning_rate": 0.000698929837538139, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 5106012.0, "repeat_count": 0.0, "routers_loss": 0.001787294982932508, "skip_count": 0.0, "step": 3496, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.33794610342316, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2333984375, "learning_rate": 0.0006985739453173903, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5109629.0, "repeat_count": 0.0, "routers_loss": 0.0006185895181261003, "skip_count": 0.0, "step": 3498, "text_loss": 0.0 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 20.349599417334304, "f1_execute": 0.9666666388511658, "f1_repeat": 0.5, "f1_skip": 1.0, "grad_norm": 0.1044921875, "learning_rate": 0.0006982179336120396, "loss": 0.0054, "macro_f1": 0.8222222328186035, "num_tokens": 5112327.0, "repeat_count": 3.0, "routers_loss": 0.05509005859494209, "skip_count": 4.0, "step": 3500, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.361252731245447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1513671875, "learning_rate": 0.0006978618026363037, "loss": 0.0076, "macro_f1": 0.3333333432674408, "num_tokens": 5114888.0, "repeat_count": 0.0, "routers_loss": 0.0007687332690693438, "skip_count": 0.0, "step": 3502, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 20.37290604515659, "f1_execute": 0.9846153855323792, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.1708984375, "learning_rate": 0.0006975055526044716, "loss": 0.008, "macro_f1": 0.8837606906890869, "num_tokens": 5118071.0, "repeat_count": 2.0, "routers_loss": 0.02829297073185444, "skip_count": 2.0, "step": 3504, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 35.0, "epoch": 20.384559359067733, "f1_execute": 0.9836065173149109, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.1611328125, "learning_rate": 0.0006971491837309034, "loss": 0.0081, "macro_f1": 0.9469165205955505, "num_tokens": 5121044.0, "repeat_count": 2.0, "routers_loss": 0.017110375687479973, "skip_count": 4.0, "step": 3506, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 20.39621267297888, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.181640625, "learning_rate": 0.0006967926962300314, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 5123990.0, "repeat_count": 0.0, "routers_loss": 0.0015549641102552414, "skip_count": 2.0, "step": 3508, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 34.0, "epoch": 20.407865986890023, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.08203125, "learning_rate": 0.0006964360903163586, "loss": 0.0065, "macro_f1": 0.9470900297164917, "num_tokens": 5126608.0, "repeat_count": 1.0, "routers_loss": 0.021001240238547325, "skip_count": 4.0, "step": 3510, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.419519300801166, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0791015625, "learning_rate": 0.0006960793662044595, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 5129390.0, "repeat_count": 0.0, "routers_loss": 0.002139861462637782, "skip_count": 0.0, "step": 3512, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 20.43117261471231, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09033203125, "learning_rate": 0.0006957225241089801, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 5132007.0, "repeat_count": 0.0, "routers_loss": 0.001974372426047921, "skip_count": 2.0, "step": 3514, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 20.442825928623453, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10009765625, "learning_rate": 0.0006953655642446368, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 5135794.0, "repeat_count": 0.0, "routers_loss": 0.0024535437114536762, "skip_count": 2.0, "step": 3516, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 20.454479242534596, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.134765625, "learning_rate": 0.0006950084868262175, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 5138656.0, "repeat_count": 3.0, "routers_loss": 0.0013523731613531709, "skip_count": 4.0, "step": 3518, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.46613255644574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2177734375, "learning_rate": 0.00069465129206858, "loss": 0.0081, "macro_f1": 0.3333333432674408, "num_tokens": 5141615.0, "repeat_count": 0.0, "routers_loss": 0.002457247581332922, "skip_count": 0.0, "step": 3520, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.477785870356882, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1474609375, "learning_rate": 0.0006942939801866532, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 5144606.0, "repeat_count": 0.0, "routers_loss": 0.0017478509107604623, "skip_count": 0.0, "step": 3522, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.489439184268026, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1826171875, "learning_rate": 0.0006939365513954368, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 5147588.0, "repeat_count": 0.0, "routers_loss": 0.002008304698392749, "skip_count": 0.0, "step": 3524, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 20.50109249817917, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0830078125, "learning_rate": 0.0006935790059100003, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 5150733.0, "repeat_count": 0.0, "routers_loss": 0.004287297371774912, "skip_count": 6.0, "step": 3526, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.512745812090312, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0006932213439454837, "loss": 0.0124, "macro_f1": 0.3333333432674408, "num_tokens": 5153699.0, "repeat_count": 0.0, "routers_loss": 0.0005050458130426705, "skip_count": 0.0, "step": 3528, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.524399126001455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07568359375, "learning_rate": 0.0006928635657170966, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 5157901.0, "repeat_count": 0.0, "routers_loss": 0.0003393345687072724, "skip_count": 0.0, "step": 3530, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 20.5360524399126, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09814453125, "learning_rate": 0.0006925056714401196, "loss": 0.0044, "macro_f1": 0.6616915464401245, "num_tokens": 5160926.0, "repeat_count": 1.0, "routers_loss": 0.021078523248434067, "skip_count": 2.0, "step": 3532, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.547705753823745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10888671875, "learning_rate": 0.0006921476613299018, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 5163784.0, "repeat_count": 0.0, "routers_loss": 0.0006570091936737299, "skip_count": 0.0, "step": 3534, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 20.55935906773489, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.12060546875, "learning_rate": 0.0006917895356018633, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 5166735.0, "repeat_count": 2.0, "routers_loss": 0.004633928649127483, "skip_count": 0.0, "step": 3536, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 20.57101238164603, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1650390625, "learning_rate": 0.0006914312944714927, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 5169728.0, "repeat_count": 0.0, "routers_loss": 0.004678684752434492, "skip_count": 1.0, "step": 3538, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.582665695557175, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07470703125, "learning_rate": 0.0006910729381543486, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 5172463.0, "repeat_count": 0.0, "routers_loss": 0.004172485787421465, "skip_count": 0.0, "step": 3540, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 20.594319009468318, "f1_execute": 0.9846153855323792, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.1591796875, "learning_rate": 0.000690714466866059, "loss": 0.0074, "macro_f1": 0.8837606906890869, "num_tokens": 5175254.0, "repeat_count": 2.0, "routers_loss": 0.013502174988389015, "skip_count": 2.0, "step": 3542, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 20.60597232337946, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.15234375, "learning_rate": 0.0006903558808223205, "loss": 0.0041, "macro_f1": 0.5507246255874634, "num_tokens": 5177936.0, "repeat_count": 0.0, "routers_loss": 0.014736099168658257, "skip_count": 2.0, "step": 3544, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.617625637290605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06689453125, "learning_rate": 0.0006899971802388996, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 5180714.0, "repeat_count": 0.0, "routers_loss": 0.002718047471717, "skip_count": 0.0, "step": 3546, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 20.629278951201748, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.134765625, "learning_rate": 0.0006896383653316307, "loss": 0.0063, "macro_f1": 0.5950249433517456, "num_tokens": 5183099.0, "repeat_count": 0.0, "routers_loss": 0.006830880884081125, "skip_count": 3.0, "step": 3548, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.64093226511289, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11376953125, "learning_rate": 0.000689279436316418, "loss": 0.0078, "macro_f1": 0.32863849401474, "num_tokens": 5186473.0, "repeat_count": 1.0, "routers_loss": 0.018640121445059776, "skip_count": 0.0, "step": 3550, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 20.652585579024034, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0751953125, "learning_rate": 0.0006889203934092337, "loss": 0.0037, "macro_f1": 1.0, "num_tokens": 5189333.0, "repeat_count": 1.0, "routers_loss": 0.003072752384468913, "skip_count": 2.0, "step": 3552, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.664238892935177, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0859375, "learning_rate": 0.0006885612368261186, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 5191939.0, "repeat_count": 0.0, "routers_loss": 0.0010331302182748914, "skip_count": 0.0, "step": 3554, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.67589220684632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.14453125, "learning_rate": 0.0006882019667831822, "loss": 0.0078, "macro_f1": 0.3333333432674408, "num_tokens": 5194118.0, "repeat_count": 0.0, "routers_loss": 0.00411403039470315, "skip_count": 0.0, "step": 3556, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 37.0, "epoch": 20.687545520757464, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0712890625, "learning_rate": 0.000687842583496602, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 5197352.0, "repeat_count": 2.0, "routers_loss": 0.005907667335122824, "skip_count": 1.0, "step": 3558, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 20.699198834668607, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1240234375, "learning_rate": 0.000687483087182624, "loss": 0.005, "macro_f1": 1.0, "num_tokens": 5200112.0, "repeat_count": 2.0, "routers_loss": 0.006927124224603176, "skip_count": 2.0, "step": 3560, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 20.710852148579754, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1650390625, "learning_rate": 0.0006871234780575616, "loss": 0.0077, "macro_f1": 0.5507246255874634, "num_tokens": 5203398.0, "repeat_count": 0.0, "routers_loss": 0.010892577469348907, "skip_count": 2.0, "step": 3562, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.722505462490897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.061767578125, "learning_rate": 0.0006867637563377966, "loss": 0.0072, "macro_f1": 0.3333333432674408, "num_tokens": 5207675.0, "repeat_count": 0.0, "routers_loss": 0.0015749590238556266, "skip_count": 0.0, "step": 3564, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 20.73415877640204, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08984375, "learning_rate": 0.0006864039222397783, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 5210750.0, "repeat_count": 0.0, "routers_loss": 0.0027824880089610815, "skip_count": 3.0, "step": 3566, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 20.745812090313184, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.09375, "learning_rate": 0.0006860439759800238, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 5213427.0, "repeat_count": 1.0, "routers_loss": 0.006465950049459934, "skip_count": 3.0, "step": 3568, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 20.757465404224327, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.20703125, "learning_rate": 0.0006856839177751175, "loss": 0.0057, "macro_f1": 0.5950249433517456, "num_tokens": 5216385.0, "repeat_count": 0.0, "routers_loss": 0.013347514905035496, "skip_count": 2.0, "step": 3570, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 20.76911871813547, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1328125, "learning_rate": 0.0006853237478417111, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 5219419.0, "repeat_count": 0.0, "routers_loss": 0.0013470026897266507, "skip_count": 2.0, "step": 3572, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 20.780772032046613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.109375, "learning_rate": 0.0006849634663965241, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 5222095.0, "repeat_count": 0.0, "routers_loss": 0.004525866359472275, "skip_count": 2.0, "step": 3574, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 20.792425345957756, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.2197265625, "learning_rate": 0.0006846030736563422, "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 5224920.0, "repeat_count": 2.0, "routers_loss": 0.0016554460162296891, "skip_count": 2.0, "step": 3576, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 20.8040786598689, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1630859375, "learning_rate": 0.000684242569838019, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 5227637.0, "repeat_count": 1.0, "routers_loss": 0.0023649358190596104, "skip_count": 0.0, "step": 3578, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 20.815731973780043, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1982421875, "learning_rate": 0.000683881955158474, "loss": 0.0083, "macro_f1": 0.6666666865348816, "num_tokens": 5230698.0, "repeat_count": 0.0, "routers_loss": 0.007323693949729204, "skip_count": 5.0, "step": 3580, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 20.827385287691186, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10693359375, "learning_rate": 0.0006835212298346941, "loss": 0.0041, "macro_f1": 1.0, "num_tokens": 5233183.0, "repeat_count": 5.0, "routers_loss": 0.004820866510272026, "skip_count": 8.0, "step": 3582, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 20.83903860160233, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.138671875, "learning_rate": 0.0006831603940837327, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 5235984.0, "repeat_count": 0.0, "routers_loss": 0.0061857919208705425, "skip_count": 2.0, "step": 3584, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.850691915513472, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10302734375, "learning_rate": 0.0006827994481227092, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 5238716.0, "repeat_count": 0.0, "routers_loss": 0.0024360883980989456, "skip_count": 0.0, "step": 3586, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 33.0, "epoch": 20.86234522942462, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.107421875, "learning_rate": 0.0006824383921688097, "loss": 0.0066, "macro_f1": 0.6139194369316101, "num_tokens": 5241667.0, "repeat_count": 0.0, "routers_loss": 0.028699232265353203, "skip_count": 4.0, "step": 3588, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 20.873998543335762, "f1_execute": 0.9714285731315613, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.30859375, "learning_rate": 0.0006820772264392868, "loss": 0.0076, "macro_f1": 0.32380953431129456, "num_tokens": 5244203.0, "repeat_count": 0.0, "routers_loss": 0.03088342770934105, "skip_count": 1.0, "step": 3590, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 20.885651857246906, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1796875, "learning_rate": 0.0006817159511514582, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 5246697.0, "repeat_count": 1.0, "routers_loss": 0.01050335168838501, "skip_count": 2.0, "step": 3592, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.89730517115805, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11474609375, "learning_rate": 0.0006813545665227085, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 5250350.0, "repeat_count": 0.0, "routers_loss": 0.002104497980326414, "skip_count": 0.0, "step": 3594, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 20.908958485069192, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.177734375, "learning_rate": 0.0006809930727704874, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 5253084.0, "repeat_count": 0.0, "routers_loss": 0.003882719436660409, "skip_count": 0.0, "step": 3596, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 20.920611798980335, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.150390625, "learning_rate": 0.0006806314701123106, "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 5255830.0, "repeat_count": 1.0, "routers_loss": 0.007497020997107029, "skip_count": 1.0, "step": 3598, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 20.93226511289148, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1572265625, "learning_rate": 0.0006802697587657594, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 5258451.0, "repeat_count": 0.0, "routers_loss": 0.0012090156087651849, "skip_count": 2.0, "step": 3600, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 20.94391842680262, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1689453125, "learning_rate": 0.00067990793894848, "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 5260855.0, "repeat_count": 1.0, "routers_loss": 0.0011546133318915963, "skip_count": 3.0, "step": 3602, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 20.955571740713765, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.087890625, "learning_rate": 0.0006795460108781847, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 5264217.0, "repeat_count": 0.0, "routers_loss": 0.005146258510649204, "skip_count": 2.0, "step": 3604, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 20.967225054624908, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.16015625, "learning_rate": 0.0006791839747726501, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 5267233.0, "repeat_count": 2.0, "routers_loss": 0.008579482324421406, "skip_count": 0.0, "step": 3606, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 20.97887836853605, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.11181640625, "learning_rate": 0.0006788218308497185, "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 5270515.0, "repeat_count": 1.0, "routers_loss": 0.004515102133154869, "skip_count": 2.0, "step": 3608, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 37.0, "epoch": 20.990531682447195, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.12890625, "learning_rate": 0.0006784595793272964, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 5273356.0, "repeat_count": 2.0, "routers_loss": 0.0024671857245266438, "skip_count": 1.0, "step": 3610, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 21.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1806640625, "learning_rate": 0.0006780972204233556, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 5275536.0, "repeat_count": 0.0, "routers_loss": 0.009937768802046776, "skip_count": 1.0, "step": 3612, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.011653313911143, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1484375, "learning_rate": 0.0006777347543559323, "loss": 0.0055, "macro_f1": 0.32863849401474, "num_tokens": 5278201.0, "repeat_count": 0.0, "routers_loss": 0.005236076656728983, "skip_count": 1.0, "step": 3614, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 21.023306627822286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0810546875, "learning_rate": 0.000677372181343127, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 5282072.0, "repeat_count": 0.0, "routers_loss": 0.0023588095791637897, "skip_count": 2.0, "step": 3616, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 21.03495994173343, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.09765625, "learning_rate": 0.0006770095016031051, "loss": 0.0041, "macro_f1": 1.0, "num_tokens": 5285056.0, "repeat_count": 1.0, "routers_loss": 0.003445093985646963, "skip_count": 3.0, "step": 3618, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 21.046613255644573, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.09130859375, "learning_rate": 0.0006766467153540953, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 5287815.0, "repeat_count": 1.0, "routers_loss": 0.00391506589949131, "skip_count": 2.0, "step": 3620, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.058266569555716, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.216796875, "learning_rate": 0.0006762838228143916, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 5290426.0, "repeat_count": 0.0, "routers_loss": 0.001963461982086301, "skip_count": 0.0, "step": 3622, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.06991988346686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1083984375, "learning_rate": 0.0006759208242023509, "loss": 0.0027, "macro_f1": 0.3333333432674408, "num_tokens": 5293339.0, "repeat_count": 0.0, "routers_loss": 0.0008344086818397045, "skip_count": 0.0, "step": 3624, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 21.081573197378006, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2412109375, "learning_rate": 0.0006755577197363944, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5295946.0, "repeat_count": 0.0, "routers_loss": 0.0025097085162997246, "skip_count": 2.0, "step": 3626, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.09322651128915, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10791015625, "learning_rate": 0.000675194509635007, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 5298532.0, "repeat_count": 0.0, "routers_loss": 0.000905053922906518, "skip_count": 0.0, "step": 3628, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.104879825200292, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0947265625, "learning_rate": 0.0006748311941167369, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 5301990.0, "repeat_count": 0.0, "routers_loss": 0.008567328564822674, "skip_count": 0.0, "step": 3630, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 36.0, "epoch": 21.116533139111436, "f1_execute": 0.9696969985961914, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 0.0732421875, "learning_rate": 0.0006744677734001961, "loss": 0.0042, "macro_f1": 0.8232323527336121, "num_tokens": 5304888.0, "repeat_count": 1.0, "routers_loss": 0.018642589449882507, "skip_count": 3.0, "step": 3632, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 21.12818645302258, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0966796875, "learning_rate": 0.0006741042477040595, "loss": 0.0042, "macro_f1": 1.0, "num_tokens": 5307644.0, "repeat_count": 2.0, "routers_loss": 0.0069702500477433205, "skip_count": 2.0, "step": 3634, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 21.139839766933722, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.087890625, "learning_rate": 0.0006737406172470657, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 5310149.0, "repeat_count": 1.0, "routers_loss": 0.0018521025776863098, "skip_count": 0.0, "step": 3636, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 21.151493080844865, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.10205078125, "learning_rate": 0.0006733768822480159, "loss": 0.0047, "macro_f1": 0.5507246255874634, "num_tokens": 5314259.0, "repeat_count": 0.0, "routers_loss": 0.008157706819474697, "skip_count": 2.0, "step": 3638, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 21.16314639475601, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.09521484375, "learning_rate": 0.0006730130429257739, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5316943.0, "repeat_count": 1.0, "routers_loss": 0.003363900352269411, "skip_count": 0.0, "step": 3640, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 21.17479970866715, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10107421875, "learning_rate": 0.0006726490994992673, "loss": 0.0037, "macro_f1": 0.6666666865348816, "num_tokens": 5319611.0, "repeat_count": 0.0, "routers_loss": 0.003208814887329936, "skip_count": 2.0, "step": 3642, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.186453022578295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.0006722850521874855, "loss": 0.009, "macro_f1": 0.3333333432674408, "num_tokens": 5323039.0, "repeat_count": 0.0, "routers_loss": 0.0017036852659657598, "skip_count": 0.0, "step": 3644, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 21.198106336489438, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10986328125, "learning_rate": 0.0006719209012094805, "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 5325544.0, "repeat_count": 1.0, "routers_loss": 0.005214404314756393, "skip_count": 3.0, "step": 3646, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 21.20975965040058, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.126953125, "learning_rate": 0.000671556646784367, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 5328230.0, "repeat_count": 0.0, "routers_loss": 0.0011388149578124285, "skip_count": 1.0, "step": 3648, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 21.221412964311725, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.047607421875, "learning_rate": 0.0006711922891313218, "loss": 0.0034, "macro_f1": 0.6666666865348816, "num_tokens": 5330890.0, "repeat_count": 0.0, "routers_loss": 0.0014066395815461874, "skip_count": 1.0, "step": 3650, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 21.23306627822287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1142578125, "learning_rate": 0.0006708278284695836, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 5333592.0, "repeat_count": 0.0, "routers_loss": 0.002691033761948347, "skip_count": 2.0, "step": 3652, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 21.244719592134015, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1533203125, "learning_rate": 0.0006704632650184532, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 5336280.0, "repeat_count": 0.0, "routers_loss": 0.00503140315413475, "skip_count": 2.0, "step": 3654, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 21.256372906045158, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.058837890625, "learning_rate": 0.0006700985989972937, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 5339512.0, "repeat_count": 0.0, "routers_loss": 0.002173262881115079, "skip_count": 4.0, "step": 3656, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 21.2680262199563, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0654296875, "learning_rate": 0.0006697338306255291, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 5342250.0, "repeat_count": 2.0, "routers_loss": 0.0029777782037854195, "skip_count": 3.0, "step": 3658, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.279679533867444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08056640625, "learning_rate": 0.0006693689601226458, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 5344872.0, "repeat_count": 0.0, "routers_loss": 0.0008051639888435602, "skip_count": 0.0, "step": 3660, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 21.291332847778587, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0654296875, "learning_rate": 0.0006690039877081908, "loss": 0.0033, "macro_f1": 0.6666666865348816, "num_tokens": 5347700.0, "repeat_count": 1.0, "routers_loss": 0.0008443260448984802, "skip_count": 0.0, "step": 3662, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 21.30298616168973, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1259765625, "learning_rate": 0.0006686389136017734, "loss": 0.0135, "macro_f1": 1.0, "num_tokens": 5350868.0, "repeat_count": 1.0, "routers_loss": 0.003584991442039609, "skip_count": 2.0, "step": 3664, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.314639475600874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0869140625, "learning_rate": 0.0006682737380230633, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 5353754.0, "repeat_count": 0.0, "routers_loss": 0.0016730015631765127, "skip_count": 0.0, "step": 3666, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 21.326292789512017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.12158203125, "learning_rate": 0.0006679084611917914, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 5356748.0, "repeat_count": 0.0, "routers_loss": 0.0040176683105528355, "skip_count": 2.0, "step": 3668, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 21.33794610342316, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10546875, "learning_rate": 0.00066754308332775, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 5359931.0, "repeat_count": 1.0, "routers_loss": 0.004662117455154657, "skip_count": 2.0, "step": 3670, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.349599417334304, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.0006671776046507916, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 5364103.0, "repeat_count": 0.0, "routers_loss": 0.0005507589085027575, "skip_count": 0.0, "step": 3672, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.361252731245447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1376953125, "learning_rate": 0.0006668120253808297, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 5366836.0, "repeat_count": 0.0, "routers_loss": 0.001940579037182033, "skip_count": 0.0, "step": 3674, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 21.37290604515659, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06787109375, "learning_rate": 0.0006664463457378382, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 5369562.0, "repeat_count": 0.0, "routers_loss": 0.0057499282993376255, "skip_count": 4.0, "step": 3676, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.384559359067733, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08544921875, "learning_rate": 0.0006660805659418516, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 5372372.0, "repeat_count": 0.0, "routers_loss": 0.0028365051839500666, "skip_count": 0.0, "step": 3678, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.39621267297888, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10888671875, "learning_rate": 0.0006657146862129645, "loss": 0.0069, "macro_f1": 0.32863849401474, "num_tokens": 5375114.0, "repeat_count": 1.0, "routers_loss": 0.033615123480558395, "skip_count": 0.0, "step": 3680, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 21.407865986890023, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1787109375, "learning_rate": 0.0006653487067713313, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 5377379.0, "repeat_count": 0.0, "routers_loss": 0.006058321334421635, "skip_count": 2.0, "step": 3682, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 21.419519300801166, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.058349609375, "learning_rate": 0.000664982627837167, "loss": 0.0037, "macro_f1": 1.0, "num_tokens": 5380063.0, "repeat_count": 1.0, "routers_loss": 0.0049779778346419334, "skip_count": 1.0, "step": 3684, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 21.43117261471231, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.08544921875, "learning_rate": 0.0006646164496307461, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 5382339.0, "repeat_count": 1.0, "routers_loss": 0.003556197742000222, "skip_count": 0.0, "step": 3686, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 21.442825928623453, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1181640625, "learning_rate": 0.000664250172372403, "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 5385050.0, "repeat_count": 1.0, "routers_loss": 0.002700002631172538, "skip_count": 1.0, "step": 3688, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 21.454479242534596, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.17578125, "learning_rate": 0.0006638837962825317, "loss": 0.0079, "macro_f1": 1.0, "num_tokens": 5387630.0, "repeat_count": 2.0, "routers_loss": 0.004441831726580858, "skip_count": 4.0, "step": 3690, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.46613255644574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0810546875, "learning_rate": 0.0006635173215815853, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5391496.0, "repeat_count": 0.0, "routers_loss": 0.0009045200422406197, "skip_count": 0.0, "step": 3692, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 21.477785870356882, "f1_execute": 0.9841269850730896, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.13671875, "learning_rate": 0.000663150748490077, "loss": 0.0048, "macro_f1": 0.6613757014274597, "num_tokens": 5393976.0, "repeat_count": 1.0, "routers_loss": 0.03596651926636696, "skip_count": 4.0, "step": 3694, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.489439184268026, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07861328125, "learning_rate": 0.0006627840772285784, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 5396607.0, "repeat_count": 0.0, "routers_loss": 0.0007029867847450078, "skip_count": 0.0, "step": 3696, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 21.50109249817917, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1337890625, "learning_rate": 0.0006624173080177206, "loss": 0.0062, "macro_f1": 0.5507246255874634, "num_tokens": 5399835.0, "repeat_count": 0.0, "routers_loss": 0.010540982708334923, "skip_count": 2.0, "step": 3698, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 21.512745812090312, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1650390625, "learning_rate": 0.0006620504410781934, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 5402859.0, "repeat_count": 1.0, "routers_loss": 0.006274465937167406, "skip_count": 2.0, "step": 3700, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.524399126001455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1435546875, "learning_rate": 0.0006616834766307457, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 5405525.0, "repeat_count": 0.0, "routers_loss": 0.0016211277106776834, "skip_count": 0.0, "step": 3702, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 21.5360524399126, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1357421875, "learning_rate": 0.000661316414896185, "loss": 0.006, "macro_f1": 1.0, "num_tokens": 5408352.0, "repeat_count": 2.0, "routers_loss": 0.0015957222785800695, "skip_count": 3.0, "step": 3704, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 21.547705753823745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1474609375, "learning_rate": 0.0006609492560953772, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 5411203.0, "repeat_count": 0.0, "routers_loss": 0.0008390289731323719, "skip_count": 1.0, "step": 3706, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 21.55935906773489, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08984375, "learning_rate": 0.0006605820004492467, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 5414092.0, "repeat_count": 0.0, "routers_loss": 0.0026837708428502083, "skip_count": 2.0, "step": 3708, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.57101238164603, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0888671875, "learning_rate": 0.0006602146481787758, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 5417123.0, "repeat_count": 0.0, "routers_loss": 0.0017910475144162774, "skip_count": 0.0, "step": 3710, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.582665695557175, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06494140625, "learning_rate": 0.0006598471995050056, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 5421806.0, "repeat_count": 0.0, "routers_loss": 0.004797408822923899, "skip_count": 0.0, "step": 3712, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 35.0, "epoch": 21.594319009468318, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1123046875, "learning_rate": 0.000659479654649035, "loss": 0.0056, "macro_f1": 0.32863849401474, "num_tokens": 5424989.0, "repeat_count": 0.0, "routers_loss": 0.008155900985002518, "skip_count": 0.0, "step": 3714, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 21.60597232337946, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2373046875, "learning_rate": 0.0006591120138320204, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5428137.0, "repeat_count": 0.0, "routers_loss": 0.003104757983237505, "skip_count": 4.0, "step": 3716, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 21.617625637290605, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.09765625, "learning_rate": 0.0006587442772751764, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5431150.0, "repeat_count": 1.0, "routers_loss": 0.0035759673919528723, "skip_count": 0.0, "step": 3718, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.9166666865348816, "avg_layers": 28.0, "epoch": 21.629278951201748, "f1_execute": 0.9767441749572754, "f1_repeat": 1.0, "f1_skip": 0.95652174949646, "grad_norm": 0.08056640625, "learning_rate": 0.0006583764451997748, "loss": 0.0035, "macro_f1": 0.9777553081512451, "num_tokens": 5433648.0, "repeat_count": 3.0, "routers_loss": 0.01098781917244196, "skip_count": 12.0, "step": 3720, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.64093226511289, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08984375, "learning_rate": 0.0006580085178271454, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 5437408.0, "repeat_count": 0.0, "routers_loss": 0.0012582973577082157, "skip_count": 0.0, "step": 3722, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 21.652585579024034, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.158203125, "learning_rate": 0.0006576404953786747, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 5439675.0, "repeat_count": 1.0, "routers_loss": 0.004184786695986986, "skip_count": 0.0, "step": 3724, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 21.664238892935177, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1240234375, "learning_rate": 0.0006572723780758069, "loss": 0.0076, "macro_f1": 0.6666666865348816, "num_tokens": 5442685.0, "repeat_count": 1.0, "routers_loss": 0.002518074819818139, "skip_count": 0.0, "step": 3726, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.67589220684632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2099609375, "learning_rate": 0.000656904166140043, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 5445115.0, "repeat_count": 0.0, "routers_loss": 0.0014018155634403229, "skip_count": 0.0, "step": 3728, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.687545520757464, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.0006565358597929411, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 5447640.0, "repeat_count": 0.0, "routers_loss": 0.0004129740409553051, "skip_count": 0.0, "step": 3730, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 21.699198834668607, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.146484375, "learning_rate": 0.0006561674592561164, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 5450860.0, "repeat_count": 1.0, "routers_loss": 0.0015176617307588458, "skip_count": 0.0, "step": 3732, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.710852148579754, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.115234375, "learning_rate": 0.0006557989647512398, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 5453507.0, "repeat_count": 0.0, "routers_loss": 0.0052530341781675816, "skip_count": 0.0, "step": 3734, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.722505462490897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.193359375, "learning_rate": 0.0006554303765000398, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 5456509.0, "repeat_count": 0.0, "routers_loss": 0.0023497045040130615, "skip_count": 0.0, "step": 3736, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.73415877640204, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.267578125, "learning_rate": 0.0006550616947243008, "loss": 0.0079, "macro_f1": 0.3333333432674408, "num_tokens": 5459089.0, "repeat_count": 0.0, "routers_loss": 0.0007271248032338917, "skip_count": 0.0, "step": 3738, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.745812090313184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09521484375, "learning_rate": 0.0006546929196458635, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 5462813.0, "repeat_count": 0.0, "routers_loss": 0.0043258448131382465, "skip_count": 0.0, "step": 3740, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 21.757465404224327, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.083984375, "learning_rate": 0.0006543240514866251, "loss": 0.0044, "macro_f1": 0.8839138746261597, "num_tokens": 5465791.0, "repeat_count": 1.0, "routers_loss": 0.007382458541542292, "skip_count": 2.0, "step": 3742, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.76911871813547, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.091796875, "learning_rate": 0.0006539550904685379, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 5468258.0, "repeat_count": 0.0, "routers_loss": 0.00039956593536771834, "skip_count": 0.0, "step": 3744, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.780772032046613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0771484375, "learning_rate": 0.0006535860368136113, "loss": 0.0058, "macro_f1": 0.3333333432674408, "num_tokens": 5470898.0, "repeat_count": 0.0, "routers_loss": 0.002978227101266384, "skip_count": 0.0, "step": 3746, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.792425345957756, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08251953125, "learning_rate": 0.0006532168907439096, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 5474644.0, "repeat_count": 0.0, "routers_loss": 0.002227995079010725, "skip_count": 0.0, "step": 3748, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 21.8040786598689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.158203125, "learning_rate": 0.0006528476524815529, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 5477354.0, "repeat_count": 0.0, "routers_loss": 0.0013099164934828877, "skip_count": 1.0, "step": 3750, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 21.815731973780043, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06787109375, "learning_rate": 0.0006524783222487167, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 5480055.0, "repeat_count": 0.0, "routers_loss": 0.0028493341524153948, "skip_count": 2.0, "step": 3752, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 21.827385287691186, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.10205078125, "learning_rate": 0.0006521089002676321, "loss": 0.0049, "macro_f1": 0.5950249433517456, "num_tokens": 5482929.0, "repeat_count": 0.0, "routers_loss": 0.01693250611424446, "skip_count": 3.0, "step": 3754, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 21.83903860160233, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.056884765625, "learning_rate": 0.0006517393867605854, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 5485756.0, "repeat_count": 1.0, "routers_loss": 0.0002758933405857533, "skip_count": 0.0, "step": 3756, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 21.850691915513472, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0634765625, "learning_rate": 0.0006513697819499175, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 5488303.0, "repeat_count": 1.0, "routers_loss": 0.0021843560971319675, "skip_count": 2.0, "step": 3758, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.86234522942462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.134765625, "learning_rate": 0.0006510000860580247, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 5490851.0, "repeat_count": 0.0, "routers_loss": 0.0007628519088029861, "skip_count": 0.0, "step": 3760, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 21.873998543335762, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.056640625, "learning_rate": 0.000650630299307358, "loss": 0.0037, "macro_f1": 0.6666666865348816, "num_tokens": 5494048.0, "repeat_count": 0.0, "routers_loss": 0.0021823991555720568, "skip_count": 1.0, "step": 3762, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 21.885651857246906, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.07958984375, "learning_rate": 0.000650260421920423, "loss": 0.0047, "macro_f1": 0.8839138746261597, "num_tokens": 5497441.0, "repeat_count": 1.0, "routers_loss": 0.010998859070241451, "skip_count": 2.0, "step": 3764, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.89730517115805, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0595703125, "learning_rate": 0.00064989045411978, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 5500217.0, "repeat_count": 0.0, "routers_loss": 0.004847705829888582, "skip_count": 0.0, "step": 3766, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 21.908958485069192, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0869140625, "learning_rate": 0.0006495203961280433, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 5502774.0, "repeat_count": 0.0, "routers_loss": 0.004135738592594862, "skip_count": 1.0, "step": 3768, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 21.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11962890625, "learning_rate": 0.0006491502481678821, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5506030.0, "repeat_count": 0.0, "routers_loss": 0.004609721712768078, "skip_count": 2.0, "step": 3770, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 21.93226511289148, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10498046875, "learning_rate": 0.0006487800104620192, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 5509153.0, "repeat_count": 0.0, "routers_loss": 0.0022236276417970657, "skip_count": 1.0, "step": 3772, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 21.94391842680262, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1279296875, "learning_rate": 0.0006484096832332316, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 5512035.0, "repeat_count": 0.0, "routers_loss": 0.010508929379284382, "skip_count": 6.0, "step": 3774, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 21.955571740713765, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0849609375, "learning_rate": 0.00064803926670435, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 5514790.0, "repeat_count": 0.0, "routers_loss": 0.006369763519614935, "skip_count": 2.0, "step": 3776, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 21.967225054624908, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0673828125, "learning_rate": 0.0006476687610982591, "loss": 0.0055, "macro_f1": 1.0, "num_tokens": 5518090.0, "repeat_count": 1.0, "routers_loss": 0.006070074159651995, "skip_count": 1.0, "step": 3778, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 21.97887836853605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.123046875, "learning_rate": 0.0006472981666378974, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 5520938.0, "repeat_count": 0.0, "routers_loss": 0.002784914569929242, "skip_count": 2.0, "step": 3780, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 21.990531682447195, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1162109375, "learning_rate": 0.000646927483546256, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 5523822.0, "repeat_count": 0.0, "routers_loss": 0.003817550837993622, "skip_count": 0.0, "step": 3782, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 22.0, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.119140625, "learning_rate": 0.0006465567120463805, "loss": 0.0039, "macro_f1": 1.0, "num_tokens": 5526752.0, "repeat_count": 1.0, "routers_loss": 0.003909189719706774, "skip_count": 2.0, "step": 3784, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.011653313911143, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0986328125, "learning_rate": 0.0006461858523613684, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 5529182.0, "repeat_count": 0.0, "routers_loss": 0.0007590921013616025, "skip_count": 0.0, "step": 3786, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.023306627822286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.078125, "learning_rate": 0.0006458149047143714, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 5532807.0, "repeat_count": 0.0, "routers_loss": 0.0008481109980493784, "skip_count": 0.0, "step": 3788, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 22.03495994173343, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10302734375, "learning_rate": 0.0006454438693285935, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 5535605.0, "repeat_count": 2.0, "routers_loss": 0.008338063955307007, "skip_count": 2.0, "step": 3790, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.046613255644573, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1669921875, "learning_rate": 0.0006450727464272916, "loss": 0.0068, "macro_f1": 0.32863849401474, "num_tokens": 5538181.0, "repeat_count": 1.0, "routers_loss": 0.014593596570193768, "skip_count": 0.0, "step": 3792, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.058266569555716, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.109375, "learning_rate": 0.0006447015362337757, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 5540902.0, "repeat_count": 0.0, "routers_loss": 0.0006725982530042529, "skip_count": 0.0, "step": 3794, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 22.06991988346686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0791015625, "learning_rate": 0.0006443302389714074, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 5543974.0, "repeat_count": 0.0, "routers_loss": 0.0011495647486299276, "skip_count": 2.0, "step": 3796, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 22.081573197378006, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10986328125, "learning_rate": 0.0006439588548636016, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 5546255.0, "repeat_count": 0.0, "routers_loss": 0.007603109814226627, "skip_count": 3.0, "step": 3798, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 22.09322651128915, "f1_execute": 0.9836065173149109, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.08251953125, "learning_rate": 0.0006435873841338249, "loss": 0.0046, "macro_f1": 0.8834244608879089, "num_tokens": 5548981.0, "repeat_count": 2.0, "routers_loss": 0.02107233554124832, "skip_count": 4.0, "step": 3800, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 22.104879825200292, "f1_execute": 0.9846153855323792, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.1875, "learning_rate": 0.0006432158270055963, "loss": 0.005, "macro_f1": 0.8837606906890869, "num_tokens": 5553719.0, "repeat_count": 2.0, "routers_loss": 0.019035672768950462, "skip_count": 2.0, "step": 3802, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 22.116533139111436, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0888671875, "learning_rate": 0.0006428441837024868, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 5556375.0, "repeat_count": 1.0, "routers_loss": 0.002087587723508477, "skip_count": 0.0, "step": 3804, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 22.12818645302258, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1650390625, "learning_rate": 0.0006424724544481189, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 5559149.0, "repeat_count": 0.0, "routers_loss": 0.0031349484343081713, "skip_count": 2.0, "step": 3806, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.139839766933722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1083984375, "learning_rate": 0.0006421006394661675, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5562741.0, "repeat_count": 0.0, "routers_loss": 0.0005523095605894923, "skip_count": 0.0, "step": 3808, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.151493080844865, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06787109375, "learning_rate": 0.0006417287389803581, "loss": 0.003, "macro_f1": 0.3333333432674408, "num_tokens": 5566224.0, "repeat_count": 0.0, "routers_loss": 0.0003959026071242988, "skip_count": 0.0, "step": 3810, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 22.16314639475601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08642578125, "learning_rate": 0.0006413567532144686, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 5569137.0, "repeat_count": 0.0, "routers_loss": 0.004793370608240366, "skip_count": 2.0, "step": 3812, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 22.17479970866715, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1455078125, "learning_rate": 0.0006409846823923277, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 5572148.0, "repeat_count": 1.0, "routers_loss": 0.005374136380851269, "skip_count": 0.0, "step": 3814, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.186453022578295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.18359375, "learning_rate": 0.0006406125267378153, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 5575293.0, "repeat_count": 0.0, "routers_loss": 0.0009206627728417516, "skip_count": 0.0, "step": 3816, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 22.198106336489438, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10791015625, "learning_rate": 0.0006402402864748626, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 5577986.0, "repeat_count": 0.0, "routers_loss": 0.003562450874596834, "skip_count": 4.0, "step": 3818, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 22.20975965040058, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09033203125, "learning_rate": 0.0006398679618274515, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 5580590.0, "repeat_count": 0.0, "routers_loss": 0.0027327914722263813, "skip_count": 2.0, "step": 3820, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 22.221412964311725, "f1_execute": 0.9705882668495178, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1796875, "learning_rate": 0.0006394955530196147, "loss": 0.0051, "macro_f1": 0.656862735748291, "num_tokens": 5582983.0, "repeat_count": 1.0, "routers_loss": 0.04326849430799484, "skip_count": 2.0, "step": 3822, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 22.23306627822287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11669921875, "learning_rate": 0.0006391230602754356, "loss": 0.0034, "macro_f1": 0.6666666865348816, "num_tokens": 5585280.0, "repeat_count": 0.0, "routers_loss": 0.0015291986055672169, "skip_count": 1.0, "step": 3824, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.244719592134015, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.21875, "learning_rate": 0.0006387504838190479, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 5587936.0, "repeat_count": 0.0, "routers_loss": 0.0009810889605432749, "skip_count": 0.0, "step": 3826, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 22.256372906045158, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.064453125, "learning_rate": 0.000638377823874636, "loss": 0.0029, "macro_f1": 0.6666666865348816, "num_tokens": 5590963.0, "repeat_count": 0.0, "routers_loss": 0.0007853418937884271, "skip_count": 2.0, "step": 3828, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 36.0, "epoch": 22.2680262199563, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.1396484375, "learning_rate": 0.0006380050806664346, "loss": 0.0058, "macro_f1": 0.9280423521995544, "num_tokens": 5593354.0, "repeat_count": 2.0, "routers_loss": 0.012884320691227913, "skip_count": 3.0, "step": 3830, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 22.279679533867444, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.158203125, "learning_rate": 0.0006376322544187279, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 5596571.0, "repeat_count": 1.0, "routers_loss": 0.0056452397257089615, "skip_count": 2.0, "step": 3832, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 22.291332847778587, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08056640625, "learning_rate": 0.0006372593453558505, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 5599029.0, "repeat_count": 0.0, "routers_loss": 0.0004138312069699168, "skip_count": 2.0, "step": 3834, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.30298616168973, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07861328125, "learning_rate": 0.000636886353702187, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5601989.0, "repeat_count": 0.0, "routers_loss": 0.0006238518399186432, "skip_count": 0.0, "step": 3836, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.314639475600874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08740234375, "learning_rate": 0.0006365132796821714, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 5605150.0, "repeat_count": 0.0, "routers_loss": 0.0008370034047402442, "skip_count": 0.0, "step": 3838, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.326292789512017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.19140625, "learning_rate": 0.0006361401235202872, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 5607564.0, "repeat_count": 0.0, "routers_loss": 0.000761129951570183, "skip_count": 0.0, "step": 3840, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.33794610342316, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09912109375, "learning_rate": 0.0006357668854410675, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 5610667.0, "repeat_count": 0.0, "routers_loss": 0.0005681869224645197, "skip_count": 0.0, "step": 3842, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 22.349599417334304, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07275390625, "learning_rate": 0.0006353935656690948, "loss": 0.0033, "macro_f1": 0.6666666865348816, "num_tokens": 5613920.0, "repeat_count": 0.0, "routers_loss": 0.0017881476087495685, "skip_count": 2.0, "step": 3844, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.361252731245447, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09326171875, "learning_rate": 0.0006350201644290005, "loss": 0.0037, "macro_f1": 0.32863849401474, "num_tokens": 5616937.0, "repeat_count": 0.0, "routers_loss": 0.014507302083075047, "skip_count": 1.0, "step": 3846, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.37290604515659, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08984375, "learning_rate": 0.0006346466819454649, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 5620648.0, "repeat_count": 0.0, "routers_loss": 0.0009559004101902246, "skip_count": 0.0, "step": 3848, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.384559359067733, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1123046875, "learning_rate": 0.0006342731184432179, "loss": 0.0067, "macro_f1": 0.3333333432674408, "num_tokens": 5623566.0, "repeat_count": 0.0, "routers_loss": 0.006162155885249376, "skip_count": 0.0, "step": 3850, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.39621267297888, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.125, "learning_rate": 0.0006338994741470373, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 5626383.0, "repeat_count": 0.0, "routers_loss": 0.003218138823285699, "skip_count": 0.0, "step": 3852, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 22.407865986890023, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11962890625, "learning_rate": 0.0006335257492817498, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 5629042.0, "repeat_count": 0.0, "routers_loss": 0.004926107823848724, "skip_count": 1.0, "step": 3854, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 22.419519300801166, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0947265625, "learning_rate": 0.0006331519440722311, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 5631934.0, "repeat_count": 2.0, "routers_loss": 0.013887457549571991, "skip_count": 4.0, "step": 3856, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 33.0, "epoch": 22.43117261471231, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.07275390625, "learning_rate": 0.0006327780587434044, "loss": 0.003, "macro_f1": 0.6139194369316101, "num_tokens": 5634762.0, "repeat_count": 0.0, "routers_loss": 0.015466908924281597, "skip_count": 4.0, "step": 3858, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 22.442825928623453, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1494140625, "learning_rate": 0.0006324040935202417, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 5637681.0, "repeat_count": 0.0, "routers_loss": 0.0017096280353143811, "skip_count": 2.0, "step": 3860, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 22.454479242534596, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0869140625, "learning_rate": 0.0006320300486277627, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 5640919.0, "repeat_count": 0.0, "routers_loss": 0.0030768339056521654, "skip_count": 4.0, "step": 3862, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 22.46613255644574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.109375, "learning_rate": 0.0006316559242910356, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 5644024.0, "repeat_count": 0.0, "routers_loss": 0.006234075874090195, "skip_count": 2.0, "step": 3864, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.477785870356882, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0859375, "learning_rate": 0.0006312817207351756, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 5646544.0, "repeat_count": 0.0, "routers_loss": 0.002910739043727517, "skip_count": 0.0, "step": 3866, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 22.489439184268026, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08056640625, "learning_rate": 0.000630907438185346, "loss": 0.0127, "macro_f1": 0.6666666865348816, "num_tokens": 5649381.0, "repeat_count": 0.0, "routers_loss": 0.002129946369677782, "skip_count": 1.0, "step": 3868, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.50109249817917, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.0006305330768667581, "loss": 0.0044, "macro_f1": 0.32863849401474, "num_tokens": 5652070.0, "repeat_count": 0.0, "routers_loss": 0.005217468831688166, "skip_count": 1.0, "step": 3870, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 22.512745812090312, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.08984375, "learning_rate": 0.0006301586370046695, "loss": 0.0045, "macro_f1": 0.8839138746261597, "num_tokens": 5654822.0, "repeat_count": 1.0, "routers_loss": 0.02469673752784729, "skip_count": 2.0, "step": 3872, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 22.524399126001455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1904296875, "learning_rate": 0.0006297841188243861, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 5657333.0, "repeat_count": 0.0, "routers_loss": 0.00829730648547411, "skip_count": 2.0, "step": 3874, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 22.5360524399126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1357421875, "learning_rate": 0.0006294095225512603, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 5659702.0, "repeat_count": 0.0, "routers_loss": 0.0012365864822641015, "skip_count": 2.0, "step": 3876, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 22.547705753823745, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08154296875, "learning_rate": 0.0006290348484106921, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 5661825.0, "repeat_count": 2.0, "routers_loss": 0.013589031994342804, "skip_count": 3.0, "step": 3878, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 22.55935906773489, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11328125, "learning_rate": 0.0006286600966281273, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 5665171.0, "repeat_count": 0.0, "routers_loss": 0.0033507090993225574, "skip_count": 3.0, "step": 3880, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 22.57101238164603, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.08642578125, "learning_rate": 0.0006282852674290595, "loss": 0.0081, "macro_f1": 0.6666666865348816, "num_tokens": 5667849.0, "repeat_count": 1.0, "routers_loss": 0.0036886115558445454, "skip_count": 0.0, "step": 3882, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 22.582665695557175, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2578125, "learning_rate": 0.0006279103610390283, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 5671409.0, "repeat_count": 0.0, "routers_loss": 0.0011586949694901705, "skip_count": 2.0, "step": 3884, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 22.594319009468318, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.12890625, "learning_rate": 0.0006275353776836199, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 5674399.0, "repeat_count": 3.0, "routers_loss": 0.012698530219495296, "skip_count": 4.0, "step": 3886, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 22.60597232337946, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.125, "learning_rate": 0.000627160317588467, "loss": 0.0058, "macro_f1": 0.661835789680481, "num_tokens": 5677504.0, "repeat_count": 1.0, "routers_loss": 0.01836787723004818, "skip_count": 1.0, "step": 3888, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.617625637290605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.146484375, "learning_rate": 0.000626785180979248, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 5680775.0, "repeat_count": 0.0, "routers_loss": 0.0014472103212028742, "skip_count": 0.0, "step": 3890, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 22.629278951201748, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1181640625, "learning_rate": 0.0006264099680816876, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 5683616.0, "repeat_count": 0.0, "routers_loss": 0.00823501218110323, "skip_count": 2.0, "step": 3892, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.64093226511289, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.087890625, "learning_rate": 0.000626034679121557, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5686623.0, "repeat_count": 0.0, "routers_loss": 0.0023378776386380196, "skip_count": 0.0, "step": 3894, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.652585579024034, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07861328125, "learning_rate": 0.0006256593143246718, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 5689191.0, "repeat_count": 0.0, "routers_loss": 0.0016715035308152437, "skip_count": 0.0, "step": 3896, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.664238892935177, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.205078125, "learning_rate": 0.0006252838739168945, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5692045.0, "repeat_count": 0.0, "routers_loss": 0.00042784950346685946, "skip_count": 0.0, "step": 3898, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 22.67589220684632, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.10205078125, "learning_rate": 0.0006249083581241324, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 5694853.0, "repeat_count": 1.0, "routers_loss": 0.003171514021232724, "skip_count": 0.0, "step": 3900, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 22.687545520757464, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10302734375, "learning_rate": 0.0006245327671723387, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 5698162.0, "repeat_count": 0.0, "routers_loss": 0.0032328921370208263, "skip_count": 3.0, "step": 3902, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.699198834668607, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0947265625, "learning_rate": 0.0006241571012875111, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 5701036.0, "repeat_count": 0.0, "routers_loss": 0.002484745578840375, "skip_count": 0.0, "step": 3904, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 22.710852148579754, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.091796875, "learning_rate": 0.000623781360695693, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 5704014.0, "repeat_count": 0.0, "routers_loss": 0.003416304476559162, "skip_count": 1.0, "step": 3906, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 22.722505462490897, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10791015625, "learning_rate": 0.0006234055456229729, "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 5706995.0, "repeat_count": 1.0, "routers_loss": 0.005278313998132944, "skip_count": 2.0, "step": 3908, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.73415877640204, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.053466796875, "learning_rate": 0.0006230296562954829, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 5709716.0, "repeat_count": 0.0, "routers_loss": 0.0006246064440347254, "skip_count": 0.0, "step": 3910, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 37.0, "epoch": 22.745812090313184, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0771484375, "learning_rate": 0.0006226536929394013, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 5713096.0, "repeat_count": 2.0, "routers_loss": 0.008745948784053326, "skip_count": 1.0, "step": 3912, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.757465404224327, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.054443359375, "learning_rate": 0.0006222776557809502, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 5715971.0, "repeat_count": 0.0, "routers_loss": 0.0015972115797922015, "skip_count": 0.0, "step": 3914, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 22.76911871813547, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0771484375, "learning_rate": 0.000621901545046396, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 5719265.0, "repeat_count": 0.0, "routers_loss": 0.0010060627246275544, "skip_count": 1.0, "step": 3916, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 22.780772032046613, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08984375, "learning_rate": 0.0006215253609620498, "loss": 0.0051, "macro_f1": 0.6615384817123413, "num_tokens": 5722176.0, "repeat_count": 1.0, "routers_loss": 0.03142716735601425, "skip_count": 3.0, "step": 3918, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 22.792425345957756, "f1_execute": 0.9836065173149109, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.1630859375, "learning_rate": 0.0006211491037542664, "loss": 0.008, "macro_f1": 0.8834244608879089, "num_tokens": 5724624.0, "repeat_count": 2.0, "routers_loss": 0.02315090037882328, "skip_count": 4.0, "step": 3920, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 22.8040786598689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.134765625, "learning_rate": 0.0006207727736494452, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 5727231.0, "repeat_count": 0.0, "routers_loss": 0.004741814453154802, "skip_count": 1.0, "step": 3922, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 22.815731973780043, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.26953125, "learning_rate": 0.0006203963708740284, "loss": 0.0078, "macro_f1": 0.6666666865348816, "num_tokens": 5730912.0, "repeat_count": 0.0, "routers_loss": 0.007773386314511299, "skip_count": 3.0, "step": 3924, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 22.827385287691186, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08935546875, "learning_rate": 0.000620019895654503, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 5733952.0, "repeat_count": 1.0, "routers_loss": 0.006144384853541851, "skip_count": 1.0, "step": 3926, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 22.83903860160233, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.10791015625, "learning_rate": 0.0006196433482173992, "loss": 0.0049, "macro_f1": 0.5507246255874634, "num_tokens": 5736659.0, "repeat_count": 0.0, "routers_loss": 0.007985481061041355, "skip_count": 2.0, "step": 3928, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 22.850691915513472, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0791015625, "learning_rate": 0.0006192667287892905, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 5739147.0, "repeat_count": 1.0, "routers_loss": 0.0009132990962825716, "skip_count": 0.0, "step": 3930, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 35.0, "epoch": 22.86234522942462, "f1_execute": 0.9846153855323792, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0625, "learning_rate": 0.0006188900375967938, "loss": 0.0058, "macro_f1": 0.928205132484436, "num_tokens": 5741835.0, "repeat_count": 1.0, "routers_loss": 0.012550560757517815, "skip_count": 3.0, "step": 3932, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.873998543335762, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11474609375, "learning_rate": 0.0006185132748665695, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 5744281.0, "repeat_count": 0.0, "routers_loss": 0.0017014549812301993, "skip_count": 0.0, "step": 3934, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 22.885651857246906, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1337890625, "learning_rate": 0.0006181364408253208, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 5747792.0, "repeat_count": 1.0, "routers_loss": 0.0014552574139088392, "skip_count": 0.0, "step": 3936, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.89730517115805, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11181640625, "learning_rate": 0.0006177595356997934, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 5751696.0, "repeat_count": 0.0, "routers_loss": 0.0027683009393513203, "skip_count": 0.0, "step": 3938, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 22.908958485069192, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.138671875, "learning_rate": 0.0006173825597167763, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 5754510.0, "repeat_count": 1.0, "routers_loss": 0.0018469853093847632, "skip_count": 4.0, "step": 3940, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 22.920611798980335, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1474609375, "learning_rate": 0.0006170055131031011, "loss": 0.006, "macro_f1": 0.5507246255874634, "num_tokens": 5756792.0, "repeat_count": 0.0, "routers_loss": 0.024953117594122887, "skip_count": 2.0, "step": 3942, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 22.93226511289148, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07861328125, "learning_rate": 0.000616628396085642, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 5759980.0, "repeat_count": 0.0, "routers_loss": 0.0006410024943761528, "skip_count": 1.0, "step": 3944, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 22.94391842680262, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.087890625, "learning_rate": 0.0006162512088913149, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 5762832.0, "repeat_count": 0.0, "routers_loss": 0.005713435355573893, "skip_count": 0.0, "step": 3946, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 22.955571740713765, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.083984375, "learning_rate": 0.0006158739517470786, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 5766584.0, "repeat_count": 1.0, "routers_loss": 0.0048615820705890656, "skip_count": 1.0, "step": 3948, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 22.967225054624908, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.11279296875, "learning_rate": 0.0006154966248799339, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 5770028.0, "repeat_count": 2.0, "routers_loss": 0.00237431307323277, "skip_count": 0.0, "step": 3950, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 22.97887836853605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1552734375, "learning_rate": 0.0006151192285169235, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 5772669.0, "repeat_count": 0.0, "routers_loss": 0.005005496088415384, "skip_count": 3.0, "step": 3952, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 22.990531682447195, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06298828125, "learning_rate": 0.0006147417628851314, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 5775551.0, "repeat_count": 0.0, "routers_loss": 0.0036195472348481417, "skip_count": 2.0, "step": 3954, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 23.0, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1357421875, "learning_rate": 0.000614364228211684, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 5777968.0, "repeat_count": 1.0, "routers_loss": 0.011829965747892857, "skip_count": 4.0, "step": 3956, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 23.011653313911143, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.056884765625, "learning_rate": 0.000613986624723749, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 5780791.0, "repeat_count": 0.0, "routers_loss": 0.008142031729221344, "skip_count": 2.0, "step": 3958, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 23.023306627822286, "f1_execute": 0.9836065173149109, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.09033203125, "learning_rate": 0.0006136089526485355, "loss": 0.0045, "macro_f1": 0.8834244608879089, "num_tokens": 5784311.0, "repeat_count": 2.0, "routers_loss": 0.029501643031835556, "skip_count": 4.0, "step": 3960, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 23.03495994173343, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10107421875, "learning_rate": 0.0006132312122132937, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 5787124.0, "repeat_count": 1.0, "routers_loss": 0.0018721094820648432, "skip_count": 3.0, "step": 3962, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.046613255644573, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08056640625, "learning_rate": 0.0006128534036453153, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 5789764.0, "repeat_count": 0.0, "routers_loss": 0.0003642186929937452, "skip_count": 0.0, "step": 3964, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.058266569555716, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.25390625, "learning_rate": 0.0006124755271719325, "loss": 0.0072, "macro_f1": 0.32863849401474, "num_tokens": 5792246.0, "repeat_count": 0.0, "routers_loss": 0.009288460947573185, "skip_count": 1.0, "step": 3966, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 23.06991988346686, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.07666015625, "learning_rate": 0.0006120975830205188, "loss": 0.0039, "macro_f1": 1.0, "num_tokens": 5794763.0, "repeat_count": 1.0, "routers_loss": 0.004904754925519228, "skip_count": 3.0, "step": 3968, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.081573197378006, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07470703125, "learning_rate": 0.0006117195714184883, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 5798109.0, "repeat_count": 0.0, "routers_loss": 0.0008115784730762243, "skip_count": 0.0, "step": 3970, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 23.09322651128915, "f1_execute": 0.9841269850730896, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.10400390625, "learning_rate": 0.0006113414925932955, "loss": 0.0046, "macro_f1": 0.8835979700088501, "num_tokens": 5800599.0, "repeat_count": 2.0, "routers_loss": 0.03376098722219467, "skip_count": 3.0, "step": 3972, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 23.104879825200292, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1162109375, "learning_rate": 0.0006109633467724356, "loss": 0.0121, "macro_f1": 0.6666666865348816, "num_tokens": 5803488.0, "repeat_count": 0.0, "routers_loss": 0.001884973724372685, "skip_count": 3.0, "step": 3974, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.116533139111436, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.0006105851341834438, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 5806247.0, "repeat_count": 0.0, "routers_loss": 0.003094443352892995, "skip_count": 0.0, "step": 3976, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 23.12818645302258, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1083984375, "learning_rate": 0.0006102068550538961, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 5809063.0, "repeat_count": 0.0, "routers_loss": 0.000867915281560272, "skip_count": 1.0, "step": 3978, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 36.0, "epoch": 23.139839766933722, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.1025390625, "learning_rate": 0.0006098285096114077, "loss": 0.0055, "macro_f1": 0.9280423521995544, "num_tokens": 5811786.0, "repeat_count": 2.0, "routers_loss": 0.020774055272340775, "skip_count": 3.0, "step": 3980, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 23.151493080844865, "f1_execute": 0.9850746393203735, "f1_repeat": 0.800000011920929, "f1_skip": 0.0, "grad_norm": 0.0849609375, "learning_rate": 0.0006094500980836344, "loss": 0.0035, "macro_f1": 0.5950249433517456, "num_tokens": 5815201.0, "repeat_count": 3.0, "routers_loss": 0.02540404722094536, "skip_count": 0.0, "step": 3982, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 23.16314639475601, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.09033203125, "learning_rate": 0.0006090716206982714, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 5818149.0, "repeat_count": 1.0, "routers_loss": 0.0035062183160334826, "skip_count": 3.0, "step": 3984, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.17479970866715, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10205078125, "learning_rate": 0.0006086930776830533, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 5821063.0, "repeat_count": 0.0, "routers_loss": 0.006513547617942095, "skip_count": 0.0, "step": 3986, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.186453022578295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.12109375, "learning_rate": 0.000608314469265755, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 5823412.0, "repeat_count": 0.0, "routers_loss": 0.0010807780781760812, "skip_count": 0.0, "step": 3988, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 23.198106336489438, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.125, "learning_rate": 0.00060793579567419, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 5826327.0, "repeat_count": 0.0, "routers_loss": 0.00868023931980133, "skip_count": 5.0, "step": 3990, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 23.20975965040058, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10888671875, "learning_rate": 0.0006075570571362112, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 5829134.0, "repeat_count": 0.0, "routers_loss": 0.0040139611810445786, "skip_count": 2.0, "step": 3992, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.221412964311725, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.181640625, "learning_rate": 0.0006071782538797112, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 5831790.0, "repeat_count": 0.0, "routers_loss": 0.001818748190999031, "skip_count": 0.0, "step": 3994, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 23.23306627822287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1064453125, "learning_rate": 0.0006067993861326201, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 5834691.0, "repeat_count": 0.0, "routers_loss": 0.0018087580101564527, "skip_count": 3.0, "step": 3996, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.244719592134015, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1650390625, "learning_rate": 0.0006064204541229082, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 5837544.0, "repeat_count": 0.0, "routers_loss": 0.0007218173122964799, "skip_count": 0.0, "step": 3998, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 23.256372906045158, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08349609375, "learning_rate": 0.000606041458078584, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 5840394.0, "repeat_count": 0.0, "routers_loss": 0.006723688915371895, "skip_count": 1.0, "step": 4000, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 23.2680262199563, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.2451171875, "learning_rate": 0.0006056623982276944, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 5842857.0, "repeat_count": 1.0, "routers_loss": 0.002316560596227646, "skip_count": 5.0, "step": 4002, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.279679533867444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.138671875, "learning_rate": 0.0006052832747983247, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 5845552.0, "repeat_count": 0.0, "routers_loss": 0.0026433479506522417, "skip_count": 0.0, "step": 4004, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 23.291332847778587, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.11083984375, "learning_rate": 0.0006049040880185987, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 5848358.0, "repeat_count": 1.0, "routers_loss": 0.009392415173351765, "skip_count": 0.0, "step": 4006, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 23.30298616168973, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.1416015625, "learning_rate": 0.0006045248381166782, "loss": 0.0067, "macro_f1": 0.5950249433517456, "num_tokens": 5851289.0, "repeat_count": 0.0, "routers_loss": 0.012923277914524078, "skip_count": 3.0, "step": 4008, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.314639475600874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.056396484375, "learning_rate": 0.0006041455253207627, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 5854191.0, "repeat_count": 0.0, "routers_loss": 0.005142854526638985, "skip_count": 0.0, "step": 4010, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 23.326292789512017, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08251953125, "learning_rate": 0.00060376614985909, "loss": 0.0034, "macro_f1": 1.0, "num_tokens": 5857570.0, "repeat_count": 1.0, "routers_loss": 0.0037508036475628614, "skip_count": 2.0, "step": 4012, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 23.33794610342316, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.27734375, "learning_rate": 0.0006033867119599354, "loss": 0.0055, "macro_f1": 0.5950249433517456, "num_tokens": 5860575.0, "repeat_count": 0.0, "routers_loss": 0.04123789072036743, "skip_count": 3.0, "step": 4014, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 23.349599417334304, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.2197265625, "learning_rate": 0.0006030072118516116, "loss": 0.0054, "macro_f1": 0.5507246255874634, "num_tokens": 5863525.0, "repeat_count": 0.0, "routers_loss": 0.008998253382742405, "skip_count": 2.0, "step": 4016, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.361252731245447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10791015625, "learning_rate": 0.000602627649762469, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 5867173.0, "repeat_count": 0.0, "routers_loss": 0.0007664313889108598, "skip_count": 0.0, "step": 4018, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 23.37290604515659, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1875, "learning_rate": 0.0006022480259208951, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 5870272.0, "repeat_count": 0.0, "routers_loss": 0.005751076154410839, "skip_count": 2.0, "step": 4020, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.384559359067733, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0703125, "learning_rate": 0.000601868340555315, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 5873607.0, "repeat_count": 0.0, "routers_loss": 0.0009048236533999443, "skip_count": 0.0, "step": 4022, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.39621267297888, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1533203125, "learning_rate": 0.00060148859389419, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 5876286.0, "repeat_count": 0.0, "routers_loss": 0.002030286705121398, "skip_count": 0.0, "step": 4024, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.407865986890023, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0869140625, "learning_rate": 0.000601108786166019, "loss": 0.0061, "macro_f1": 0.3333333432674408, "num_tokens": 5878960.0, "repeat_count": 0.0, "routers_loss": 0.0008742345962673426, "skip_count": 0.0, "step": 4026, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 23.419519300801166, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07470703125, "learning_rate": 0.0006007289175993374, "loss": 0.003, "macro_f1": 0.6666666865348816, "num_tokens": 5881647.0, "repeat_count": 0.0, "routers_loss": 0.0026560418773442507, "skip_count": 1.0, "step": 4028, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.43117261471231, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.072265625, "learning_rate": 0.000600348988422717, "loss": 0.003, "macro_f1": 0.3333333432674408, "num_tokens": 5884273.0, "repeat_count": 0.0, "routers_loss": 0.002993235597386956, "skip_count": 0.0, "step": 4030, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 23.442825928623453, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1162109375, "learning_rate": 0.0005999689988647666, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 5886977.0, "repeat_count": 1.0, "routers_loss": 0.0036973324604332447, "skip_count": 0.0, "step": 4032, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 23.454479242534596, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1279296875, "learning_rate": 0.0005995889491541308, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 5889336.0, "repeat_count": 0.0, "routers_loss": 0.0018305482808500528, "skip_count": 2.0, "step": 4034, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 23.46613255644574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.12109375, "learning_rate": 0.0005992088395194907, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 5892196.0, "repeat_count": 0.0, "routers_loss": 0.007992503233253956, "skip_count": 2.0, "step": 4036, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.477785870356882, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.201171875, "learning_rate": 0.0005988286701895631, "loss": 0.0064, "macro_f1": 0.3333333432674408, "num_tokens": 5895158.0, "repeat_count": 0.0, "routers_loss": 0.0010173144983127713, "skip_count": 0.0, "step": 4038, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 23.489439184268026, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.15234375, "learning_rate": 0.0005984484413931013, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 5897959.0, "repeat_count": 0.0, "routers_loss": 0.002448364393785596, "skip_count": 2.0, "step": 4040, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 23.50109249817917, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09912109375, "learning_rate": 0.0005980681533588938, "loss": 0.0028, "macro_f1": 0.6666666865348816, "num_tokens": 5900445.0, "repeat_count": 0.0, "routers_loss": 0.004141320940107107, "skip_count": 1.0, "step": 4042, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.512745812090312, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08740234375, "learning_rate": 0.0005976878063157652, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 5903767.0, "repeat_count": 0.0, "routers_loss": 0.00419309176504612, "skip_count": 0.0, "step": 4044, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.524399126001455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1533203125, "learning_rate": 0.0005973074004925755, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 5906691.0, "repeat_count": 0.0, "routers_loss": 0.0008423164254054427, "skip_count": 0.0, "step": 4046, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 23.5360524399126, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.04833984375, "learning_rate": 0.0005969269361182197, "loss": 0.0035, "macro_f1": 1.0, "num_tokens": 5909934.0, "repeat_count": 3.0, "routers_loss": 0.003494736272841692, "skip_count": 6.0, "step": 4048, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 23.547705753823745, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.07568359375, "learning_rate": 0.0005965464134216284, "loss": 0.0064, "macro_f1": 1.0, "num_tokens": 5913204.0, "repeat_count": 1.0, "routers_loss": 0.0032704919576644897, "skip_count": 2.0, "step": 4050, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 23.55935906773489, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.109375, "learning_rate": 0.0005961658326317674, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 5916374.0, "repeat_count": 0.0, "routers_loss": 0.007182225584983826, "skip_count": 2.0, "step": 4052, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.57101238164603, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08349609375, "learning_rate": 0.0005957851939776368, "loss": 0.0075, "macro_f1": 0.3333333432674408, "num_tokens": 5919335.0, "repeat_count": 0.0, "routers_loss": 0.0024788687005639076, "skip_count": 0.0, "step": 4054, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 23.582665695557175, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.061767578125, "learning_rate": 0.0005954044976882724, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 5923107.0, "repeat_count": 0.0, "routers_loss": 0.0029900320805609226, "skip_count": 2.0, "step": 4056, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 23.594319009468318, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1884765625, "learning_rate": 0.0005950237439927441, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 5926307.0, "repeat_count": 0.0, "routers_loss": 0.005798702128231525, "skip_count": 2.0, "step": 4058, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.60597232337946, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10888671875, "learning_rate": 0.0005946429331201566, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 5929021.0, "repeat_count": 0.0, "routers_loss": 0.000783930707257241, "skip_count": 0.0, "step": 4060, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 23.617625637290605, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.109375, "learning_rate": 0.0005942620652996487, "loss": 0.0034, "macro_f1": 0.6666666865348816, "num_tokens": 5931508.0, "repeat_count": 1.0, "routers_loss": 0.0023568086326122284, "skip_count": 0.0, "step": 4062, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.629278951201748, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.20703125, "learning_rate": 0.0005938811407603939, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 5934639.0, "repeat_count": 0.0, "routers_loss": 0.002283278154209256, "skip_count": 0.0, "step": 4064, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.64093226511289, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0927734375, "learning_rate": 0.0005935001597315995, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 5937636.0, "repeat_count": 0.0, "routers_loss": 0.000495585729368031, "skip_count": 0.0, "step": 4066, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.652585579024034, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1357421875, "learning_rate": 0.0005931191224425068, "loss": 0.0069, "macro_f1": 0.3333333432674408, "num_tokens": 5940955.0, "repeat_count": 0.0, "routers_loss": 0.005762206390500069, "skip_count": 0.0, "step": 4068, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 23.664238892935177, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.166015625, "learning_rate": 0.0005927380291223911, "loss": 0.0056, "macro_f1": 0.4901960790157318, "num_tokens": 5944881.0, "repeat_count": 0.0, "routers_loss": 0.02430931106209755, "skip_count": 3.0, "step": 4070, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 23.67589220684632, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.080078125, "learning_rate": 0.0005923568800005613, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 5947617.0, "repeat_count": 1.0, "routers_loss": 0.006138184107840061, "skip_count": 4.0, "step": 4072, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.687545520757464, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08154296875, "learning_rate": 0.0005919756753063601, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 5950497.0, "repeat_count": 0.0, "routers_loss": 0.0004989682929590344, "skip_count": 0.0, "step": 4074, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 23.699198834668607, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1435546875, "learning_rate": 0.0005915944152691633, "loss": 0.0073, "macro_f1": 0.6666666865348816, "num_tokens": 5954946.0, "repeat_count": 0.0, "routers_loss": 0.0061099957674741745, "skip_count": 1.0, "step": 4076, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 23.710852148579754, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10107421875, "learning_rate": 0.0005912131001183803, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 5957790.0, "repeat_count": 0.0, "routers_loss": 0.0025855700951069593, "skip_count": 2.0, "step": 4078, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 23.722505462490897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07275390625, "learning_rate": 0.0005908317300834534, "loss": 0.0037, "macro_f1": 0.6666666865348816, "num_tokens": 5960627.0, "repeat_count": 0.0, "routers_loss": 0.0022625159472227097, "skip_count": 1.0, "step": 4080, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 23.73415877640204, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.09228515625, "learning_rate": 0.0005904503053938583, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 5963998.0, "repeat_count": 1.0, "routers_loss": 0.002250903518870473, "skip_count": 0.0, "step": 4082, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 23.745812090313184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.12158203125, "learning_rate": 0.000590068826279103, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 5966725.0, "repeat_count": 0.0, "routers_loss": 0.0017974289366975427, "skip_count": 2.0, "step": 4084, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 23.757465404224327, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.09130859375, "learning_rate": 0.0005896872929687287, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 5970238.0, "repeat_count": 2.0, "routers_loss": 0.00864106509834528, "skip_count": 2.0, "step": 4086, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.76911871813547, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.115234375, "learning_rate": 0.0005893057056923093, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 5972924.0, "repeat_count": 0.0, "routers_loss": 0.00045281523489393294, "skip_count": 0.0, "step": 4088, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 36.0, "epoch": 23.780772032046613, "f1_execute": 0.9850746393203735, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.07421875, "learning_rate": 0.0005889240646794507, "loss": 0.0044, "macro_f1": 0.8839138746261597, "num_tokens": 5976197.0, "repeat_count": 1.0, "routers_loss": 0.007443464361131191, "skip_count": 2.0, "step": 4090, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 23.792425345957756, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08447265625, "learning_rate": 0.0005885423701597918, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 5978849.0, "repeat_count": 0.0, "routers_loss": 0.004947007168084383, "skip_count": 1.0, "step": 4092, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 23.8040786598689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11767578125, "learning_rate": 0.0005881606223630028, "loss": 0.0074, "macro_f1": 0.6666666865348816, "num_tokens": 5981784.0, "repeat_count": 0.0, "routers_loss": 0.004844254814088345, "skip_count": 4.0, "step": 4094, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 23.815731973780043, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0673828125, "learning_rate": 0.0005877788215187867, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 5984447.0, "repeat_count": 0.0, "routers_loss": 0.006664087064564228, "skip_count": 3.0, "step": 4096, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.827385287691186, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2060546875, "learning_rate": 0.0005873969678568783, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 5987194.0, "repeat_count": 0.0, "routers_loss": 0.0031506677623838186, "skip_count": 0.0, "step": 4098, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.83903860160233, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07763671875, "learning_rate": 0.000587015061607044, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 5989959.0, "repeat_count": 0.0, "routers_loss": 0.0019086742540821433, "skip_count": 0.0, "step": 4100, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.850691915513472, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1708984375, "learning_rate": 0.0005866331029990818, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 5992513.0, "repeat_count": 0.0, "routers_loss": 0.0026433677412569523, "skip_count": 0.0, "step": 4102, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 23.86234522942462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.103515625, "learning_rate": 0.0005862510922628213, "loss": 0.0072, "macro_f1": 0.6666666865348816, "num_tokens": 5995926.0, "repeat_count": 0.0, "routers_loss": 0.005440241657197475, "skip_count": 1.0, "step": 4104, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 23.873998543335762, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.0005858690296281235, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 5998894.0, "repeat_count": 2.0, "routers_loss": 0.003370592137798667, "skip_count": 0.0, "step": 4106, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.885651857246906, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1474609375, "learning_rate": 0.000585486915324881, "loss": 0.0073, "macro_f1": 0.3333333432674408, "num_tokens": 6001406.0, "repeat_count": 0.0, "routers_loss": 0.0006028945208527148, "skip_count": 0.0, "step": 4108, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 23.89730517115805, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0927734375, "learning_rate": 0.0005851047495830163, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 6004785.0, "repeat_count": 0.0, "routers_loss": 0.009675469249486923, "skip_count": 4.0, "step": 4110, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.908958485069192, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11767578125, "learning_rate": 0.0005847225326324843, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 6007295.0, "repeat_count": 0.0, "routers_loss": 0.004869884345680475, "skip_count": 0.0, "step": 4112, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1376953125, "learning_rate": 0.0005843402647032699, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 6009388.0, "repeat_count": 0.0, "routers_loss": 0.00043720597750507295, "skip_count": 0.0, "step": 4114, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 34.0, "epoch": 23.93226511289148, "f1_execute": 0.9696969985961914, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.08544921875, "learning_rate": 0.0005839579460253886, "loss": 0.0074, "macro_f1": 0.5454546213150024, "num_tokens": 6012198.0, "repeat_count": 0.0, "routers_loss": 0.030817938968539238, "skip_count": 4.0, "step": 4116, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.94391842680262, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1220703125, "learning_rate": 0.0005835755768288869, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 6015959.0, "repeat_count": 0.0, "routers_loss": 0.0006276617059484124, "skip_count": 0.0, "step": 4118, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 35.0, "epoch": 23.955571740713765, "f1_execute": 0.9705882668495178, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.08740234375, "learning_rate": 0.0005831931573438414, "loss": 0.0054, "macro_f1": 0.4901960790157318, "num_tokens": 6018748.0, "repeat_count": 0.0, "routers_loss": 0.0177327748388052, "skip_count": 3.0, "step": 4120, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.967225054624908, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07861328125, "learning_rate": 0.0005828106878003592, "loss": 0.0048, "macro_f1": 0.32863849401474, "num_tokens": 6021419.0, "repeat_count": 0.0, "routers_loss": 0.009365832433104515, "skip_count": 1.0, "step": 4122, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.97887836853605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0908203125, "learning_rate": 0.000582428168428577, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 6024385.0, "repeat_count": 0.0, "routers_loss": 0.004596297163516283, "skip_count": 0.0, "step": 4124, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 23.990531682447195, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1416015625, "learning_rate": 0.0005820455994586621, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 6027140.0, "repeat_count": 0.0, "routers_loss": 0.0014652730897068977, "skip_count": 0.0, "step": 4126, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 24.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10400390625, "learning_rate": 0.0005816629811208112, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 6029184.0, "repeat_count": 0.0, "routers_loss": 0.005553579423576593, "skip_count": 1.0, "step": 4128, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 24.011653313911143, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.291015625, "learning_rate": 0.0005812803136452509, "loss": 0.0054, "macro_f1": 0.5507246255874634, "num_tokens": 6031775.0, "repeat_count": 0.0, "routers_loss": 0.02859097346663475, "skip_count": 2.0, "step": 4130, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 24.023306627822286, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1806640625, "learning_rate": 0.0005808975972622375, "loss": 0.0052, "macro_f1": 0.5507246255874634, "num_tokens": 6034161.0, "repeat_count": 0.0, "routers_loss": 0.00840715877711773, "skip_count": 2.0, "step": 4132, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 24.03495994173343, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.26953125, "learning_rate": 0.0005805148322020565, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 6036774.0, "repeat_count": 2.0, "routers_loss": 0.0020257646683603525, "skip_count": 4.0, "step": 4134, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 24.046613255644573, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07373046875, "learning_rate": 0.0005801320186950229, "loss": 0.0039, "macro_f1": 0.6616915464401245, "num_tokens": 6039446.0, "repeat_count": 1.0, "routers_loss": 0.026752406731247902, "skip_count": 2.0, "step": 4136, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 24.058266569555716, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.062255859375, "learning_rate": 0.0005797491569714803, "loss": 0.0028, "macro_f1": 0.6666666865348816, "num_tokens": 6043399.0, "repeat_count": 1.0, "routers_loss": 0.00023761657939758152, "skip_count": 0.0, "step": 4138, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.06991988346686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0927734375, "learning_rate": 0.000579366247261802, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 6046172.0, "repeat_count": 0.0, "routers_loss": 0.0016283976146951318, "skip_count": 0.0, "step": 4140, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.081573197378006, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1103515625, "learning_rate": 0.0005789832897963899, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6049022.0, "repeat_count": 0.0, "routers_loss": 0.0014563349541276693, "skip_count": 0.0, "step": 4142, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.09322651128915, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09521484375, "learning_rate": 0.0005786002848056746, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 6052161.0, "repeat_count": 0.0, "routers_loss": 0.0005975402309559286, "skip_count": 0.0, "step": 4144, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 24.104879825200292, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10693359375, "learning_rate": 0.0005782172325201155, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 6054893.0, "repeat_count": 1.0, "routers_loss": 0.0023097970988601446, "skip_count": 2.0, "step": 4146, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 24.116533139111436, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09375, "learning_rate": 0.0005778341331702, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 6058191.0, "repeat_count": 0.0, "routers_loss": 0.0035230624489486217, "skip_count": 1.0, "step": 4148, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 36.0, "epoch": 24.12818645302258, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.1328125, "learning_rate": 0.0005774509869864443, "loss": 0.0057, "macro_f1": 0.9280423521995544, "num_tokens": 6061228.0, "repeat_count": 2.0, "routers_loss": 0.0073737651109695435, "skip_count": 3.0, "step": 4150, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.139839766933722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1328125, "learning_rate": 0.0005770677941993925, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 6064487.0, "repeat_count": 0.0, "routers_loss": 0.0027577814180403948, "skip_count": 0.0, "step": 4152, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 24.151493080844865, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09033203125, "learning_rate": 0.0005766845550396169, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 6067812.0, "repeat_count": 0.0, "routers_loss": 0.012239047326147556, "skip_count": 2.0, "step": 4154, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 24.16314639475601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10009765625, "learning_rate": 0.0005763012697377176, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 6070432.0, "repeat_count": 0.0, "routers_loss": 0.002428908832371235, "skip_count": 3.0, "step": 4156, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.17479970866715, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1142578125, "learning_rate": 0.0005759179385243224, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 6074430.0, "repeat_count": 0.0, "routers_loss": 0.0033553587272763252, "skip_count": 0.0, "step": 4158, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.186453022578295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0927734375, "learning_rate": 0.000575534561630087, "loss": 0.0074, "macro_f1": 0.3333333432674408, "num_tokens": 6077283.0, "repeat_count": 0.0, "routers_loss": 0.0015582222258672118, "skip_count": 0.0, "step": 4160, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 24.198106336489438, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.06005859375, "learning_rate": 0.0005751511392856943, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 6080602.0, "repeat_count": 1.0, "routers_loss": 0.0022373117972165346, "skip_count": 0.0, "step": 4162, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.20975965040058, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.15234375, "learning_rate": 0.0005747676717218548, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 6083992.0, "repeat_count": 0.0, "routers_loss": 0.003193996846675873, "skip_count": 0.0, "step": 4164, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 24.221412964311725, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11328125, "learning_rate": 0.0005743841591693059, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 6086628.0, "repeat_count": 0.0, "routers_loss": 0.0012532769469544291, "skip_count": 2.0, "step": 4166, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 24.23306627822287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.044921875, "learning_rate": 0.0005740006018588123, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 6089348.0, "repeat_count": 0.0, "routers_loss": 0.0012354545760899782, "skip_count": 1.0, "step": 4168, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 24.244719592134015, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1416015625, "learning_rate": 0.0005736170000211656, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 6092022.0, "repeat_count": 1.0, "routers_loss": 0.006958292797207832, "skip_count": 4.0, "step": 4170, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 24.256372906045158, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0849609375, "learning_rate": 0.0005732333538871841, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 6094606.0, "repeat_count": 0.0, "routers_loss": 0.004888501483947039, "skip_count": 2.0, "step": 4172, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 24.2680262199563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10888671875, "learning_rate": 0.0005728496636877128, "loss": 0.0117, "macro_f1": 0.6666666865348816, "num_tokens": 6097451.0, "repeat_count": 0.0, "routers_loss": 0.002297621686011553, "skip_count": 1.0, "step": 4174, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.279679533867444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1171875, "learning_rate": 0.0005724659296536234, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 6100147.0, "repeat_count": 0.0, "routers_loss": 0.004589734133332968, "skip_count": 0.0, "step": 4176, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 24.291332847778587, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1845703125, "learning_rate": 0.0005720821520158137, "loss": 0.0066, "macro_f1": 0.5507246255874634, "num_tokens": 6103224.0, "repeat_count": 0.0, "routers_loss": 0.00849376805126667, "skip_count": 2.0, "step": 4178, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 24.30298616168973, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0751953125, "learning_rate": 0.0005716983310052078, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 6106520.0, "repeat_count": 0.0, "routers_loss": 0.0011285687796771526, "skip_count": 2.0, "step": 4180, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.314639475600874, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.2119140625, "learning_rate": 0.0005713144668527558, "loss": 0.0065, "macro_f1": 0.32863849401474, "num_tokens": 6109109.0, "repeat_count": 0.0, "routers_loss": 0.00809546373784542, "skip_count": 1.0, "step": 4182, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 24.326292789512017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1484375, "learning_rate": 0.0005709305597894343, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 6111911.0, "repeat_count": 0.0, "routers_loss": 0.015268114395439625, "skip_count": 3.0, "step": 4184, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 24.33794610342316, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1142578125, "learning_rate": 0.000570546610046245, "loss": 0.0057, "macro_f1": 0.5507246255874634, "num_tokens": 6114820.0, "repeat_count": 0.0, "routers_loss": 0.00706381956115365, "skip_count": 1.0, "step": 4186, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.349599417334304, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07080078125, "learning_rate": 0.0005701626178542157, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 6118454.0, "repeat_count": 0.0, "routers_loss": 0.002111377427354455, "skip_count": 0.0, "step": 4188, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.361252731245447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06787109375, "learning_rate": 0.0005697785834443997, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 6121898.0, "repeat_count": 0.0, "routers_loss": 0.000836490944493562, "skip_count": 0.0, "step": 4190, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 24.37290604515659, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.111328125, "learning_rate": 0.0005693945070478757, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 6125073.0, "repeat_count": 1.0, "routers_loss": 0.0028079887852072716, "skip_count": 0.0, "step": 4192, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 24.384559359067733, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11376953125, "learning_rate": 0.0005690103888957473, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 6128031.0, "repeat_count": 0.0, "routers_loss": 0.0013389994855970144, "skip_count": 2.0, "step": 4194, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 24.39621267297888, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07568359375, "learning_rate": 0.0005686262292191438, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 6130749.0, "repeat_count": 0.0, "routers_loss": 0.0010625157738104463, "skip_count": 1.0, "step": 4196, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 24.407865986890023, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.083984375, "learning_rate": 0.0005682420282492192, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 6133426.0, "repeat_count": 1.0, "routers_loss": 0.004137418232858181, "skip_count": 0.0, "step": 4198, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 24.419519300801166, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09765625, "learning_rate": 0.0005678577862171522, "loss": 0.0068, "macro_f1": 0.6666666865348816, "num_tokens": 6136173.0, "repeat_count": 0.0, "routers_loss": 0.0047592888586223125, "skip_count": 3.0, "step": 4200, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.43117261471231, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07958984375, "learning_rate": 0.0005674735033541468, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 6139174.0, "repeat_count": 0.0, "routers_loss": 0.006027603521943092, "skip_count": 0.0, "step": 4202, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.442825928623453, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10009765625, "learning_rate": 0.0005670891798914305, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 6141560.0, "repeat_count": 0.0, "routers_loss": 0.0007967035635374486, "skip_count": 0.0, "step": 4204, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.454479242534596, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.078125, "learning_rate": 0.0005667048160602564, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 6143891.0, "repeat_count": 0.0, "routers_loss": 0.0007010282133705914, "skip_count": 0.0, "step": 4206, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.46613255644574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08056640625, "learning_rate": 0.0005663204120919012, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 6146934.0, "repeat_count": 0.0, "routers_loss": 0.000740555755328387, "skip_count": 0.0, "step": 4208, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.477785870356882, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1025390625, "learning_rate": 0.0005659359682176659, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 6149601.0, "repeat_count": 0.0, "routers_loss": 0.0010617696680128574, "skip_count": 0.0, "step": 4210, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 24.489439184268026, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.119140625, "learning_rate": 0.0005655514846688755, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 6152147.0, "repeat_count": 1.0, "routers_loss": 0.003334556706249714, "skip_count": 0.0, "step": 4212, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 24.50109249817917, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10205078125, "learning_rate": 0.0005651669616768788, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 6155276.0, "repeat_count": 2.0, "routers_loss": 0.002633491763845086, "skip_count": 2.0, "step": 4214, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 24.512745812090312, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0751953125, "learning_rate": 0.0005647823994730489, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 6158907.0, "repeat_count": 0.0, "routers_loss": 0.002782978117465973, "skip_count": 1.0, "step": 4216, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 24.524399126001455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07470703125, "learning_rate": 0.0005643977982887814, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 6161523.0, "repeat_count": 0.0, "routers_loss": 0.005236674100160599, "skip_count": 3.0, "step": 4218, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 24.5360524399126, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0654296875, "learning_rate": 0.0005640131583554964, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 6165125.0, "repeat_count": 2.0, "routers_loss": 0.002421734156087041, "skip_count": 3.0, "step": 4220, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 24.547705753823745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.12353515625, "learning_rate": 0.0005636284799046368, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 6167975.0, "repeat_count": 0.0, "routers_loss": 0.0022071092389523983, "skip_count": 1.0, "step": 4222, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.55935906773489, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.095703125, "learning_rate": 0.0005632437631676687, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 6170652.0, "repeat_count": 0.0, "routers_loss": 0.0007290175417438149, "skip_count": 0.0, "step": 4224, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.57101238164603, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08837890625, "learning_rate": 0.0005628590083760814, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 6173338.0, "repeat_count": 0.0, "routers_loss": 0.002211257815361023, "skip_count": 0.0, "step": 4226, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 24.582665695557175, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.07275390625, "learning_rate": 0.0005624742157613869, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 6176139.0, "repeat_count": 2.0, "routers_loss": 0.0016124887624755502, "skip_count": 0.0, "step": 4228, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 24.594319009468318, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08349609375, "learning_rate": 0.0005620893855551203, "loss": 0.0046, "macro_f1": 1.0, "num_tokens": 6178453.0, "repeat_count": 1.0, "routers_loss": 0.00777839869260788, "skip_count": 1.0, "step": 4230, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 24.60597232337946, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08935546875, "learning_rate": 0.0005617045179888388, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 6181694.0, "repeat_count": 0.0, "routers_loss": 0.008473426103591919, "skip_count": 2.0, "step": 4232, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 33.0, "epoch": 24.617625637290605, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.123046875, "learning_rate": 0.0005613196132941225, "loss": 0.0039, "macro_f1": 0.6139194369316101, "num_tokens": 6184276.0, "repeat_count": 0.0, "routers_loss": 0.01436393428593874, "skip_count": 4.0, "step": 4234, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 24.629278951201748, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06689453125, "learning_rate": 0.0005609346717025737, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 6187320.0, "repeat_count": 0.0, "routers_loss": 0.002655827673152089, "skip_count": 1.0, "step": 4236, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 24.64093226511289, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1015625, "learning_rate": 0.0005605496934458171, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 6191127.0, "repeat_count": 1.0, "routers_loss": 0.008637362159788609, "skip_count": 1.0, "step": 4238, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.652585579024034, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1435546875, "learning_rate": 0.000560164678755499, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 6194291.0, "repeat_count": 0.0, "routers_loss": 0.001989758340641856, "skip_count": 0.0, "step": 4240, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 24.664238892935177, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.25390625, "learning_rate": 0.0005597796278632879, "loss": 0.0098, "macro_f1": 0.6666666865348816, "num_tokens": 6197136.0, "repeat_count": 0.0, "routers_loss": 0.0024501606822013855, "skip_count": 2.0, "step": 4242, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.67589220684632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08642578125, "learning_rate": 0.0005593945410008741, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 6200145.0, "repeat_count": 0.0, "routers_loss": 0.003672860562801361, "skip_count": 0.0, "step": 4244, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.687545520757464, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.087890625, "learning_rate": 0.0005590094183999698, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 6203138.0, "repeat_count": 0.0, "routers_loss": 0.002702688565477729, "skip_count": 0.0, "step": 4246, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.699198834668607, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08740234375, "learning_rate": 0.0005586242602923081, "loss": 0.0041, "macro_f1": 0.3333333432674408, "num_tokens": 6205628.0, "repeat_count": 0.0, "routers_loss": 0.002490787534043193, "skip_count": 0.0, "step": 4248, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 24.710852148579754, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.2412109375, "learning_rate": 0.000558239066909644, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 6208378.0, "repeat_count": 1.0, "routers_loss": 0.007209382019937038, "skip_count": 1.0, "step": 4250, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 24.722505462490897, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1298828125, "learning_rate": 0.0005578538384837533, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 6211018.0, "repeat_count": 1.0, "routers_loss": 0.004274892155081034, "skip_count": 0.0, "step": 4252, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 24.73415877640204, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.12158203125, "learning_rate": 0.0005574685752464334, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 6213833.0, "repeat_count": 3.0, "routers_loss": 0.011757797561585903, "skip_count": 12.0, "step": 4254, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 24.745812090313184, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.09765625, "learning_rate": 0.000557083277429502, "loss": 0.0056, "macro_f1": 1.0, "num_tokens": 6216668.0, "repeat_count": 1.0, "routers_loss": 0.0065588620491325855, "skip_count": 4.0, "step": 4256, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 24.757465404224327, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0849609375, "learning_rate": 0.0005566979452647982, "loss": 0.0051, "macro_f1": 0.6616915464401245, "num_tokens": 6219252.0, "repeat_count": 1.0, "routers_loss": 0.026186438277363777, "skip_count": 2.0, "step": 4258, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 34.0, "epoch": 24.76911871813547, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.10498046875, "learning_rate": 0.0005563125789841814, "loss": 0.0047, "macro_f1": 0.9470900297164917, "num_tokens": 6221908.0, "repeat_count": 1.0, "routers_loss": 0.011956623755395412, "skip_count": 4.0, "step": 4260, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 24.780772032046613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1865234375, "learning_rate": 0.0005559271788195318, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 6225798.0, "repeat_count": 0.0, "routers_loss": 0.007607071660459042, "skip_count": 1.0, "step": 4262, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.792425345957756, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0673828125, "learning_rate": 0.0005555417450027498, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 6228923.0, "repeat_count": 0.0, "routers_loss": 0.0004758680588565767, "skip_count": 0.0, "step": 4264, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 24.8040786598689, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0888671875, "learning_rate": 0.0005551562777657559, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 6231809.0, "repeat_count": 1.0, "routers_loss": 0.00765147153288126, "skip_count": 2.0, "step": 4266, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 33.0, "epoch": 24.815731973780043, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.1240234375, "learning_rate": 0.0005547707773404911, "loss": 0.0055, "macro_f1": 0.6139194369316101, "num_tokens": 6234809.0, "repeat_count": 0.0, "routers_loss": 0.012789309024810791, "skip_count": 4.0, "step": 4268, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 24.827385287691186, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.111328125, "learning_rate": 0.0005543852439589161, "loss": 0.0059, "macro_f1": 1.0, "num_tokens": 6237725.0, "repeat_count": 1.0, "routers_loss": 0.007457905448973179, "skip_count": 7.0, "step": 4270, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 24.83903860160233, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08984375, "learning_rate": 0.0005539996778530115, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 6240181.0, "repeat_count": 0.0, "routers_loss": 0.0031702020205557346, "skip_count": 2.0, "step": 4272, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 24.850691915513472, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.09716796875, "learning_rate": 0.0005536140792547776, "loss": 0.0033, "macro_f1": 1.0, "num_tokens": 6243058.0, "repeat_count": 1.0, "routers_loss": 0.006566284690052271, "skip_count": 1.0, "step": 4274, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 24.86234522942462, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1015625, "learning_rate": 0.0005532284483962341, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 6246048.0, "repeat_count": 0.0, "routers_loss": 0.0019324725726619363, "skip_count": 1.0, "step": 4276, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 24.873998543335762, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1416015625, "learning_rate": 0.0005528427855094205, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 6248808.0, "repeat_count": 0.0, "routers_loss": 0.003190638730302453, "skip_count": 1.0, "step": 4278, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 24.885651857246906, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.07177734375, "learning_rate": 0.0005524570908263952, "loss": 0.0067, "macro_f1": 1.0, "num_tokens": 6251682.0, "repeat_count": 2.0, "routers_loss": 0.0030710892751812935, "skip_count": 2.0, "step": 4280, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 24.89730517115805, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.138671875, "learning_rate": 0.000552071364579236, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 6254825.0, "repeat_count": 1.0, "routers_loss": 0.002710547298192978, "skip_count": 1.0, "step": 4282, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 24.908958485069192, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1181640625, "learning_rate": 0.0005516856070000393, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 6257959.0, "repeat_count": 0.0, "routers_loss": 0.0026841454673558474, "skip_count": 2.0, "step": 4284, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08251953125, "learning_rate": 0.0005512998183209206, "loss": 0.004, "macro_f1": 0.3333333432674408, "num_tokens": 6260577.0, "repeat_count": 0.0, "routers_loss": 0.0009103236370719969, "skip_count": 0.0, "step": 4286, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.93226511289148, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09521484375, "learning_rate": 0.0005509139987740143, "loss": 0.0085, "macro_f1": 0.3333333432674408, "num_tokens": 6263706.0, "repeat_count": 0.0, "routers_loss": 0.005102985538542271, "skip_count": 0.0, "step": 4288, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.94391842680262, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06884765625, "learning_rate": 0.0005505281485914731, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 6266388.0, "repeat_count": 0.0, "routers_loss": 0.0021953575778752565, "skip_count": 0.0, "step": 4290, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 24.955571740713765, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.072265625, "learning_rate": 0.0005501422680054684, "loss": 0.0026, "macro_f1": 0.6666666865348816, "num_tokens": 6269566.0, "repeat_count": 0.0, "routers_loss": 0.007061024662107229, "skip_count": 2.0, "step": 4292, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.967225054624908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.146484375, "learning_rate": 0.0005497563572481895, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 6272492.0, "repeat_count": 0.0, "routers_loss": 0.00034200013033114374, "skip_count": 0.0, "step": 4294, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 24.97887836853605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1337890625, "learning_rate": 0.000549370416551844, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 6275433.0, "repeat_count": 0.0, "routers_loss": 0.0013249301118776202, "skip_count": 0.0, "step": 4296, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 39.0, "epoch": 24.990531682447195, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.154296875, "learning_rate": 0.0005489844461486578, "loss": 0.0068, "macro_f1": 1.0, "num_tokens": 6278139.0, "repeat_count": 5.0, "routers_loss": 0.0009300060919485986, "skip_count": 2.0, "step": 4298, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0986328125, "learning_rate": 0.0005485984462708743, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 6280400.0, "repeat_count": 0.0, "routers_loss": 0.001073137973435223, "skip_count": 0.0, "step": 4300, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.011653313911143, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10009765625, "learning_rate": 0.000548212417150755, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 6283307.0, "repeat_count": 0.0, "routers_loss": 0.002807670971378684, "skip_count": 0.0, "step": 4302, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 25.023306627822286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07861328125, "learning_rate": 0.0005478263590205784, "loss": 0.003, "macro_f1": 0.6666666865348816, "num_tokens": 6286628.0, "repeat_count": 0.0, "routers_loss": 0.0018132972763851285, "skip_count": 2.0, "step": 4304, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 25.03495994173343, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.078125, "learning_rate": 0.0005474402721126414, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 6289971.0, "repeat_count": 1.0, "routers_loss": 0.0007753430400043726, "skip_count": 1.0, "step": 4306, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 25.046613255644573, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.130859375, "learning_rate": 0.0005470541566592572, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 6292524.0, "repeat_count": 0.0, "routers_loss": 0.004159194882959127, "skip_count": 1.0, "step": 4308, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.058266569555716, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.059326171875, "learning_rate": 0.0005466680128927568, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 6295365.0, "repeat_count": 0.0, "routers_loss": 0.000578906387090683, "skip_count": 0.0, "step": 4310, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 25.06991988346686, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.232421875, "learning_rate": 0.0005462818410454882, "loss": 0.006, "macro_f1": 1.0, "num_tokens": 6297868.0, "repeat_count": 1.0, "routers_loss": 0.0034579674247652292, "skip_count": 3.0, "step": 4312, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 25.081573197378006, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11376953125, "learning_rate": 0.0005458956413498159, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 6300817.0, "repeat_count": 0.0, "routers_loss": 0.012606903910636902, "skip_count": 2.0, "step": 4314, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 25.09322651128915, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.05810546875, "learning_rate": 0.0005455094140381217, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6303722.0, "repeat_count": 0.0, "routers_loss": 0.0035160391125828028, "skip_count": 1.0, "step": 4316, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.104879825200292, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07958984375, "learning_rate": 0.0005451231593428035, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 6306580.0, "repeat_count": 0.0, "routers_loss": 0.0009573212009854615, "skip_count": 0.0, "step": 4318, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.116533139111436, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.109375, "learning_rate": 0.0005447368774962763, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 6309323.0, "repeat_count": 0.0, "routers_loss": 0.000933259550947696, "skip_count": 0.0, "step": 4320, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.12818645302258, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0927734375, "learning_rate": 0.0005443505687309705, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 6311871.0, "repeat_count": 0.0, "routers_loss": 0.0017469670856371522, "skip_count": 0.0, "step": 4322, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 35.0, "epoch": 25.139839766933722, "f1_execute": 0.9836065173149109, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.15625, "learning_rate": 0.0005439642332793339, "loss": 0.0044, "macro_f1": 0.9469165205955505, "num_tokens": 6315823.0, "repeat_count": 2.0, "routers_loss": 0.018045924603939056, "skip_count": 4.0, "step": 4324, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 25.151493080844865, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.10986328125, "learning_rate": 0.0005435778713738292, "loss": 0.0122, "macro_f1": 0.6666666865348816, "num_tokens": 6319133.0, "repeat_count": 1.0, "routers_loss": 0.0020323526114225388, "skip_count": 0.0, "step": 4326, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 25.16314639475601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.12060546875, "learning_rate": 0.0005431914832469357, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 6321548.0, "repeat_count": 0.0, "routers_loss": 0.008756798692047596, "skip_count": 3.0, "step": 4328, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 25.17479970866715, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1240234375, "learning_rate": 0.0005428050691311483, "loss": 0.0046, "macro_f1": 1.0, "num_tokens": 6324166.0, "repeat_count": 1.0, "routers_loss": 0.003112410195171833, "skip_count": 3.0, "step": 4330, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 25.186453022578295, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.07861328125, "learning_rate": 0.0005424186292589775, "loss": 0.0057, "macro_f1": 1.0, "num_tokens": 6326859.0, "repeat_count": 2.0, "routers_loss": 0.003284019883722067, "skip_count": 2.0, "step": 4332, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.198106336489438, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06787109375, "learning_rate": 0.0005420321638629496, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6329789.0, "repeat_count": 0.0, "routers_loss": 0.0009520717430859804, "skip_count": 0.0, "step": 4334, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 25.20975965040058, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08349609375, "learning_rate": 0.0005416456731756057, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 6332479.0, "repeat_count": 0.0, "routers_loss": 0.005411659833043814, "skip_count": 2.0, "step": 4336, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 25.221412964311725, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1455078125, "learning_rate": 0.0005412591574295027, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 6335340.0, "repeat_count": 1.0, "routers_loss": 0.000482909323181957, "skip_count": 0.0, "step": 4338, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.23306627822287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.072265625, "learning_rate": 0.0005408726168572121, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 6337660.0, "repeat_count": 0.0, "routers_loss": 0.002821375848725438, "skip_count": 0.0, "step": 4340, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 35.0, "epoch": 25.244719592134015, "f1_execute": 0.9846153855323792, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.11083984375, "learning_rate": 0.0005404860516913206, "loss": 0.0052, "macro_f1": 0.928205132484436, "num_tokens": 6340254.0, "repeat_count": 1.0, "routers_loss": 0.009596377611160278, "skip_count": 3.0, "step": 4342, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 25.256372906045158, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.11962890625, "learning_rate": 0.0005400994621644294, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 6343128.0, "repeat_count": 1.0, "routers_loss": 0.00805640034377575, "skip_count": 4.0, "step": 4344, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 25.2680262199563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0908203125, "learning_rate": 0.0005397128485091551, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 6345751.0, "repeat_count": 0.0, "routers_loss": 0.0008109475602395833, "skip_count": 1.0, "step": 4346, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 25.279679533867444, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.197265625, "learning_rate": 0.0005393262109581278, "loss": 0.0046, "macro_f1": 0.5507246255874634, "num_tokens": 6348398.0, "repeat_count": 0.0, "routers_loss": 0.014267010614275932, "skip_count": 2.0, "step": 4348, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 25.291332847778587, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.111328125, "learning_rate": 0.000538939549743993, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 6351364.0, "repeat_count": 0.0, "routers_loss": 0.0031446784269064665, "skip_count": 1.0, "step": 4350, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 25.30298616168973, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.08251953125, "learning_rate": 0.0005385528650994095, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 6354196.0, "repeat_count": 1.0, "routers_loss": 0.0010975562036037445, "skip_count": 0.0, "step": 4352, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.314639475600874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11376953125, "learning_rate": 0.0005381661572570508, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 6356810.0, "repeat_count": 0.0, "routers_loss": 0.0012434590607881546, "skip_count": 0.0, "step": 4354, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.326292789512017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.0005377794264496041, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 6359486.0, "repeat_count": 0.0, "routers_loss": 0.006992827635258436, "skip_count": 0.0, "step": 4356, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 25.33794610342316, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1328125, "learning_rate": 0.0005373926729097706, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 6362783.0, "repeat_count": 0.0, "routers_loss": 0.00757464999333024, "skip_count": 5.0, "step": 4358, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 25.349599417334304, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.087890625, "learning_rate": 0.0005370058968702651, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 6365273.0, "repeat_count": 0.0, "routers_loss": 0.0023171750362962484, "skip_count": 2.0, "step": 4360, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 25.361252731245447, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.10205078125, "learning_rate": 0.0005366190985638159, "loss": 0.0038, "macro_f1": 0.6666666865348816, "num_tokens": 6368066.0, "repeat_count": 1.0, "routers_loss": 0.0007172332843765616, "skip_count": 0.0, "step": 4362, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 25.37290604515659, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.04443359375, "learning_rate": 0.0005362322782231647, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 6371218.0, "repeat_count": 0.0, "routers_loss": 0.001014209003187716, "skip_count": 1.0, "step": 4364, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.384559359067733, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1474609375, "learning_rate": 0.0005358454360810665, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6374145.0, "repeat_count": 0.0, "routers_loss": 0.003506881883367896, "skip_count": 0.0, "step": 4366, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 34.0, "epoch": 25.39621267297888, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.0869140625, "learning_rate": 0.0005354585723702892, "loss": 0.0049, "macro_f1": 0.9470900297164917, "num_tokens": 6377428.0, "repeat_count": 1.0, "routers_loss": 0.011506919749081135, "skip_count": 4.0, "step": 4368, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.407865986890023, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1474609375, "learning_rate": 0.0005350716873236143, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 6380478.0, "repeat_count": 0.0, "routers_loss": 0.0010309527860954404, "skip_count": 0.0, "step": 4370, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 25.419519300801166, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.2578125, "learning_rate": 0.0005346847811738354, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 6383367.0, "repeat_count": 2.0, "routers_loss": 0.0004843998176511377, "skip_count": 0.0, "step": 4372, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 25.43117261471231, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07568359375, "learning_rate": 0.0005342978541537591, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 6386045.0, "repeat_count": 0.0, "routers_loss": 0.004218748304992914, "skip_count": 2.0, "step": 4374, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 25.442825928623453, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.099609375, "learning_rate": 0.0005339109064962047, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 6389357.0, "repeat_count": 0.0, "routers_loss": 0.0011573208030313253, "skip_count": 1.0, "step": 4376, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 25.454479242534596, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0830078125, "learning_rate": 0.0005335239384340037, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 6392223.0, "repeat_count": 1.0, "routers_loss": 0.00198833248578012, "skip_count": 0.0, "step": 4378, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 25.46613255644574, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.05078125, "learning_rate": 0.0005331369502000002, "loss": 0.0033, "macro_f1": 0.5507246255874634, "num_tokens": 6395252.0, "repeat_count": 0.0, "routers_loss": 0.014816648326814175, "skip_count": 2.0, "step": 4380, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.477785870356882, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.095703125, "learning_rate": 0.0005327499420270497, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 6398336.0, "repeat_count": 0.0, "routers_loss": 0.000924952735658735, "skip_count": 0.0, "step": 4382, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.489439184268026, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0712890625, "learning_rate": 0.0005323629141480207, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 6401949.0, "repeat_count": 0.0, "routers_loss": 0.0013091614237055182, "skip_count": 0.0, "step": 4384, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 35.0, "epoch": 25.50109249817917, "f1_execute": 0.9836065173149109, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.12109375, "learning_rate": 0.0005319758667957928, "loss": 0.0055, "macro_f1": 0.9469165205955505, "num_tokens": 6404899.0, "repeat_count": 2.0, "routers_loss": 0.019175469875335693, "skip_count": 4.0, "step": 4386, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 25.512745812090312, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1376953125, "learning_rate": 0.0005315888002032578, "loss": 0.0068, "macro_f1": 1.0, "num_tokens": 6408491.0, "repeat_count": 1.0, "routers_loss": 0.007192625198513269, "skip_count": 2.0, "step": 4388, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 25.524399126001455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.21484375, "learning_rate": 0.0005312017146033185, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 6411363.0, "repeat_count": 0.0, "routers_loss": 0.0029335254803299904, "skip_count": 1.0, "step": 4390, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.5360524399126, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10107421875, "learning_rate": 0.0005308146102288898, "loss": 0.0053, "macro_f1": 0.3333333432674408, "num_tokens": 6414544.0, "repeat_count": 0.0, "routers_loss": 0.0020698949228972197, "skip_count": 0.0, "step": 4392, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 25.547705753823745, "f1_execute": 0.9846153855323792, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.0859375, "learning_rate": 0.0005304274873128974, "loss": 0.007, "macro_f1": 0.8837606906890869, "num_tokens": 6417788.0, "repeat_count": 2.0, "routers_loss": 0.026231681928038597, "skip_count": 2.0, "step": 4394, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 25.55935906773489, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0732421875, "learning_rate": 0.0005300403460882783, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 6420711.0, "repeat_count": 0.0, "routers_loss": 0.006493912078440189, "skip_count": 3.0, "step": 4396, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 25.57101238164603, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.13671875, "learning_rate": 0.000529653186787981, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 6423626.0, "repeat_count": 0.0, "routers_loss": 0.0029219489078968763, "skip_count": 2.0, "step": 4398, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.582665695557175, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06787109375, "learning_rate": 0.0005292660096449638, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 6426631.0, "repeat_count": 0.0, "routers_loss": 0.0030647367238998413, "skip_count": 0.0, "step": 4400, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 25.594319009468318, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06884765625, "learning_rate": 0.0005288788148921968, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 6428876.0, "repeat_count": 0.0, "routers_loss": 0.0065133762545883656, "skip_count": 2.0, "step": 4402, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 25.60597232337946, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.087890625, "learning_rate": 0.00052849160276266, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 6431577.0, "repeat_count": 0.0, "routers_loss": 0.006944406311959028, "skip_count": 2.0, "step": 4404, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 37.0, "epoch": 25.617625637290605, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10693359375, "learning_rate": 0.0005281043734893444, "loss": 0.0042, "macro_f1": 1.0, "num_tokens": 6434472.0, "repeat_count": 2.0, "routers_loss": 0.004346081521362066, "skip_count": 1.0, "step": 4406, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 25.629278951201748, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08984375, "learning_rate": 0.000527717127305251, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 6437379.0, "repeat_count": 0.0, "routers_loss": 0.010902500711381435, "skip_count": 2.0, "step": 4408, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 33.0, "epoch": 25.64093226511289, "f1_execute": 0.9846153855323792, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.08642578125, "learning_rate": 0.0005273298644433907, "loss": 0.0044, "macro_f1": 0.6139194369316101, "num_tokens": 6440144.0, "repeat_count": 0.0, "routers_loss": 0.022242611274123192, "skip_count": 4.0, "step": 4410, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 25.652585579024034, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0625, "learning_rate": 0.0005269425851367851, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 6443554.0, "repeat_count": 2.0, "routers_loss": 0.0028225588612258434, "skip_count": 2.0, "step": 4412, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.664238892935177, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1083984375, "learning_rate": 0.0005265552896184652, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 6446705.0, "repeat_count": 0.0, "routers_loss": 0.0008063282584771514, "skip_count": 0.0, "step": 4414, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.67589220684632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08837890625, "learning_rate": 0.000526167978121472, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 6450481.0, "repeat_count": 0.0, "routers_loss": 0.0009634968591853976, "skip_count": 0.0, "step": 4416, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 25.687545520757464, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06787109375, "learning_rate": 0.0005257806508788557, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 6453545.0, "repeat_count": 0.0, "routers_loss": 0.0034965418744832277, "skip_count": 1.0, "step": 4418, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 25.699198834668607, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1279296875, "learning_rate": 0.0005253933081236768, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 6456323.0, "repeat_count": 0.0, "routers_loss": 0.00694741727784276, "skip_count": 1.0, "step": 4420, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.710852148579754, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05126953125, "learning_rate": 0.0005250059500890042, "loss": 0.0033, "macro_f1": 0.3333333432674408, "num_tokens": 6459112.0, "repeat_count": 0.0, "routers_loss": 0.003361609298735857, "skip_count": 0.0, "step": 4422, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 25.722505462490897, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.142578125, "learning_rate": 0.0005246185770079166, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 6461777.0, "repeat_count": 1.0, "routers_loss": 0.001507434411905706, "skip_count": 2.0, "step": 4424, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.73415877640204, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1552734375, "learning_rate": 0.0005242311891135016, "loss": 0.0056, "macro_f1": 0.3333333432674408, "num_tokens": 6464386.0, "repeat_count": 0.0, "routers_loss": 0.0008854904444888234, "skip_count": 0.0, "step": 4426, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 25.745812090313184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.14453125, "learning_rate": 0.0005238437866388555, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 6466716.0, "repeat_count": 0.0, "routers_loss": 0.00269315205514431, "skip_count": 2.0, "step": 4428, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 25.757465404224327, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0771484375, "learning_rate": 0.0005234563698170837, "loss": 0.0058, "macro_f1": 1.0, "num_tokens": 6469848.0, "repeat_count": 1.0, "routers_loss": 0.0012634476879611611, "skip_count": 1.0, "step": 4430, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.76911871813547, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07958984375, "learning_rate": 0.0005230689388813, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 6472363.0, "repeat_count": 0.0, "routers_loss": 0.004975263029336929, "skip_count": 0.0, "step": 4432, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 25.780772032046613, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.11181640625, "learning_rate": 0.0005226814940646269, "loss": 0.0061, "macro_f1": 0.5507246255874634, "num_tokens": 6475164.0, "repeat_count": 0.0, "routers_loss": 0.01377043966203928, "skip_count": 2.0, "step": 4434, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.792425345957756, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0732421875, "learning_rate": 0.0005222940356001951, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 6477760.0, "repeat_count": 0.0, "routers_loss": 0.0015045310137793422, "skip_count": 0.0, "step": 4436, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.8040786598689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09033203125, "learning_rate": 0.0005219065637211435, "loss": 0.0033, "macro_f1": 0.3333333432674408, "num_tokens": 6480595.0, "repeat_count": 0.0, "routers_loss": 0.00023096184304449707, "skip_count": 0.0, "step": 4438, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 25.815731973780043, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.064453125, "learning_rate": 0.0005215190786606192, "loss": 0.0066, "macro_f1": 1.0, "num_tokens": 6483536.0, "repeat_count": 1.0, "routers_loss": 0.0027167934458702803, "skip_count": 4.0, "step": 4440, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 25.827385287691186, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.083984375, "learning_rate": 0.000521131580651777, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 6486016.0, "repeat_count": 0.0, "routers_loss": 0.0022964212112128735, "skip_count": 1.0, "step": 4442, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.83903860160233, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10595703125, "learning_rate": 0.0005207440699277798, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 6488793.0, "repeat_count": 0.0, "routers_loss": 0.0012706738198176026, "skip_count": 0.0, "step": 4444, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 25.850691915513472, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0927734375, "learning_rate": 0.000520356546721798, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 6491448.0, "repeat_count": 0.0, "routers_loss": 0.001884019235149026, "skip_count": 4.0, "step": 4446, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 25.86234522942462, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.11279296875, "learning_rate": 0.0005199690112670093, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 6494555.0, "repeat_count": 2.0, "routers_loss": 0.0014895298518240452, "skip_count": 0.0, "step": 4448, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.873998543335762, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10888671875, "learning_rate": 0.0005195814637965991, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 6498211.0, "repeat_count": 0.0, "routers_loss": 0.0010786742204800248, "skip_count": 0.0, "step": 4450, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 25.885651857246906, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.07421875, "learning_rate": 0.00051919390454376, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 6501414.0, "repeat_count": 1.0, "routers_loss": 0.0038931029848754406, "skip_count": 3.0, "step": 4452, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 25.89730517115805, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.12158203125, "learning_rate": 0.0005188063337416915, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 6504148.0, "repeat_count": 0.0, "routers_loss": 0.008902371861040592, "skip_count": 3.0, "step": 4454, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 25.908958485069192, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.09912109375, "learning_rate": 0.0005184187516236, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 6506987.0, "repeat_count": 2.0, "routers_loss": 0.006383785046637058, "skip_count": 4.0, "step": 4456, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1025390625, "learning_rate": 0.0005180311584226991, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 6509987.0, "repeat_count": 0.0, "routers_loss": 0.001914900611154735, "skip_count": 0.0, "step": 4458, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 25.93226511289148, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.07861328125, "learning_rate": 0.0005176435543722088, "loss": 0.0039, "macro_f1": 1.0, "num_tokens": 6512890.0, "repeat_count": 2.0, "routers_loss": 0.006772174034267664, "skip_count": 2.0, "step": 4460, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.94391842680262, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0751953125, "learning_rate": 0.0005172559397053553, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 6517503.0, "repeat_count": 0.0, "routers_loss": 0.0013125346740707755, "skip_count": 0.0, "step": 4462, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.955571740713765, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.134765625, "learning_rate": 0.0005168683146553719, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 6520076.0, "repeat_count": 0.0, "routers_loss": 0.001289060222916305, "skip_count": 0.0, "step": 4464, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 25.967225054624908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08740234375, "learning_rate": 0.0005164806794554979, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 6522859.0, "repeat_count": 0.0, "routers_loss": 0.0005723516223952174, "skip_count": 0.0, "step": 4466, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 25.97887836853605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.107421875, "learning_rate": 0.0005160930343389782, "loss": 0.0041, "macro_f1": 0.6666666865348816, "num_tokens": 6526263.0, "repeat_count": 0.0, "routers_loss": 0.0066565945744514465, "skip_count": 2.0, "step": 4468, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 25.990531682447195, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.08984375, "learning_rate": 0.0005157053795390641, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 6528984.0, "repeat_count": 1.0, "routers_loss": 0.0015940230805426836, "skip_count": 0.0, "step": 4470, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 26.0, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.09423828125, "learning_rate": 0.000515317715289013, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 6531616.0, "repeat_count": 1.0, "routers_loss": 0.0030665129888802767, "skip_count": 0.0, "step": 4472, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 26.011653313911143, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08984375, "learning_rate": 0.0005149300418220875, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 6534358.0, "repeat_count": 0.0, "routers_loss": 0.0010073521407321095, "skip_count": 1.0, "step": 4474, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 26.023306627822286, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.140625, "learning_rate": 0.0005145423593715557, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 6537734.0, "repeat_count": 1.0, "routers_loss": 0.0012026336044073105, "skip_count": 0.0, "step": 4476, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 26.03495994173343, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1533203125, "learning_rate": 0.0005141546681706917, "loss": 0.0041, "macro_f1": 1.0, "num_tokens": 6541347.0, "repeat_count": 1.0, "routers_loss": 0.002558073727414012, "skip_count": 1.0, "step": 4478, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.046613255644573, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09814453125, "learning_rate": 0.0005137669684527743, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 6544177.0, "repeat_count": 0.0, "routers_loss": 0.0004181562690064311, "skip_count": 0.0, "step": 4480, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.058266569555716, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05908203125, "learning_rate": 0.0005133792604510874, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 6547890.0, "repeat_count": 0.0, "routers_loss": 0.0008044944261200726, "skip_count": 0.0, "step": 4482, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 26.06991988346686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0888671875, "learning_rate": 0.0005129915443989206, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 6550898.0, "repeat_count": 0.0, "routers_loss": 0.006158736534416676, "skip_count": 2.0, "step": 4484, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.081573197378006, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1083984375, "learning_rate": 0.0005126038205295673, "loss": 0.0033, "macro_f1": 0.3333333432674408, "num_tokens": 6553484.0, "repeat_count": 0.0, "routers_loss": 0.0015021997969597578, "skip_count": 0.0, "step": 4486, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 26.09322651128915, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.10498046875, "learning_rate": 0.0005122160890763266, "loss": 0.0047, "macro_f1": 0.5950249433517456, "num_tokens": 6556420.0, "repeat_count": 0.0, "routers_loss": 0.01464066281914711, "skip_count": 3.0, "step": 4488, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 26.104879825200292, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1767578125, "learning_rate": 0.0005118283502725014, "loss": 0.0062, "macro_f1": 0.6666666865348816, "num_tokens": 6559084.0, "repeat_count": 0.0, "routers_loss": 0.009348138235509396, "skip_count": 2.0, "step": 4490, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 26.116533139111436, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1455078125, "learning_rate": 0.0005114406043513995, "loss": 0.0035, "macro_f1": 0.6666666865348816, "num_tokens": 6561982.0, "repeat_count": 0.0, "routers_loss": 0.00521179148927331, "skip_count": 2.0, "step": 4492, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.12818645302258, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08056640625, "learning_rate": 0.0005110528515463328, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 6564746.0, "repeat_count": 0.0, "routers_loss": 0.0020424220710992813, "skip_count": 0.0, "step": 4494, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.139839766933722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09326171875, "learning_rate": 0.0005106650920906171, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 6567513.0, "repeat_count": 0.0, "routers_loss": 0.001969850854948163, "skip_count": 0.0, "step": 4496, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 26.151493080844865, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.041015625, "learning_rate": 0.000510277326217573, "loss": 0.0115, "macro_f1": 0.6666666865348816, "num_tokens": 6570377.0, "repeat_count": 0.0, "routers_loss": 0.0006809664191678166, "skip_count": 1.0, "step": 4498, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 26.16314639475601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09619140625, "learning_rate": 0.0005098895541605238, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 6573339.0, "repeat_count": 0.0, "routers_loss": 0.001985863782465458, "skip_count": 2.0, "step": 4500, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 26.17479970866715, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07666015625, "learning_rate": 0.0005095017761527976, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 6576137.0, "repeat_count": 0.0, "routers_loss": 0.006653652526438236, "skip_count": 2.0, "step": 4502, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 26.186453022578295, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.11572265625, "learning_rate": 0.0005091139924277252, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 6579107.0, "repeat_count": 1.0, "routers_loss": 0.00208374229259789, "skip_count": 0.0, "step": 4504, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 26.198106336489438, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10888671875, "learning_rate": 0.0005087262032186418, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 6582199.0, "repeat_count": 1.0, "routers_loss": 0.003741761902347207, "skip_count": 3.0, "step": 4506, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.20975965040058, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.123046875, "learning_rate": 0.0005083384087588849, "loss": 0.0051, "macro_f1": 0.3333333432674408, "num_tokens": 6584985.0, "repeat_count": 0.0, "routers_loss": 0.0061462377198040485, "skip_count": 0.0, "step": 4508, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.221412964311725, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09130859375, "learning_rate": 0.0005079506092817958, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 6587810.0, "repeat_count": 0.0, "routers_loss": 0.006289886310696602, "skip_count": 0.0, "step": 4510, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 26.23306627822287, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.11669921875, "learning_rate": 0.0005075628050207187, "loss": 0.0049, "macro_f1": 1.0, "num_tokens": 6590243.0, "repeat_count": 1.0, "routers_loss": 0.009318302385509014, "skip_count": 4.0, "step": 4512, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.244719592134015, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11669921875, "learning_rate": 0.0005071749962090004, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 6593335.0, "repeat_count": 0.0, "routers_loss": 0.0008890405297279358, "skip_count": 0.0, "step": 4514, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 26.256372906045158, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.087890625, "learning_rate": 0.000506787183079991, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 6596553.0, "repeat_count": 0.0, "routers_loss": 0.013275794684886932, "skip_count": 3.0, "step": 4516, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 26.2680262199563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0791015625, "learning_rate": 0.0005063993658670425, "loss": 0.0027, "macro_f1": 0.6666666865348816, "num_tokens": 6599239.0, "repeat_count": 0.0, "routers_loss": 0.00296062626875937, "skip_count": 1.0, "step": 4518, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 26.279679533867444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09228515625, "learning_rate": 0.0005060115448035098, "loss": 0.0033, "macro_f1": 0.6666666865348816, "num_tokens": 6601740.0, "repeat_count": 0.0, "routers_loss": 0.002070694463327527, "skip_count": 2.0, "step": 4520, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.291332847778587, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0517578125, "learning_rate": 0.0005056237201227502, "loss": 0.0027, "macro_f1": 0.3333333432674408, "num_tokens": 6604426.0, "repeat_count": 0.0, "routers_loss": 0.00040311901830136776, "skip_count": 0.0, "step": 4522, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 26.30298616168973, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0947265625, "learning_rate": 0.0005052358920581229, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 6607224.0, "repeat_count": 0.0, "routers_loss": 0.000870975898578763, "skip_count": 2.0, "step": 4524, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 26.314639475600874, "f1_execute": 0.9846153855323792, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.16015625, "learning_rate": 0.0005048480608429893, "loss": 0.0064, "macro_f1": 0.8837606906890869, "num_tokens": 6610154.0, "repeat_count": 2.0, "routers_loss": 0.02137824334204197, "skip_count": 2.0, "step": 4526, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 26.326292789512017, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.09765625, "learning_rate": 0.0005044602267107125, "loss": 0.005, "macro_f1": 1.0, "num_tokens": 6613157.0, "repeat_count": 1.0, "routers_loss": 0.0020982506684958935, "skip_count": 2.0, "step": 4528, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.33794610342316, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07373046875, "learning_rate": 0.0005040723898946574, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 6615980.0, "repeat_count": 0.0, "routers_loss": 0.0010950990254059434, "skip_count": 0.0, "step": 4530, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 26.349599417334304, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06396484375, "learning_rate": 0.0005036845506281908, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 6618826.0, "repeat_count": 0.0, "routers_loss": 0.0013156067579984665, "skip_count": 1.0, "step": 4532, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 26.361252731245447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11865234375, "learning_rate": 0.0005032967091446809, "loss": 0.0035, "macro_f1": 0.6666666865348816, "num_tokens": 6621464.0, "repeat_count": 0.0, "routers_loss": 0.0011440361849963665, "skip_count": 2.0, "step": 4534, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.37290604515659, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09619140625, "learning_rate": 0.000502908865677497, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 6624645.0, "repeat_count": 0.0, "routers_loss": 0.0037283364217728376, "skip_count": 0.0, "step": 4536, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 26.384559359067733, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0869140625, "learning_rate": 0.0005025210204600096, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 6627632.0, "repeat_count": 1.0, "routers_loss": 0.002676889766007662, "skip_count": 0.0, "step": 4538, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 39.0, "epoch": 26.39621267297888, "f1_execute": 0.9824560880661011, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0966796875, "learning_rate": 0.0005021331737255904, "loss": 0.0062, "macro_f1": 0.9274853467941284, "num_tokens": 6630277.0, "repeat_count": 5.0, "routers_loss": 0.006609564647078514, "skip_count": 3.0, "step": 4540, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 26.407865986890023, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.047607421875, "learning_rate": 0.0005017453257076119, "loss": 0.0044, "macro_f1": 1.0, "num_tokens": 6632616.0, "repeat_count": 2.0, "routers_loss": 0.005354173015803099, "skip_count": 2.0, "step": 4542, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 26.419519300801166, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10009765625, "learning_rate": 0.0005013574766394478, "loss": 0.0051, "macro_f1": 1.0, "num_tokens": 6635378.0, "repeat_count": 2.0, "routers_loss": 0.00598987378180027, "skip_count": 2.0, "step": 4544, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.43117261471231, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.060791015625, "learning_rate": 0.0005009696267544715, "loss": 0.0033, "macro_f1": 0.3333333432674408, "num_tokens": 6638101.0, "repeat_count": 0.0, "routers_loss": 0.0011447870638221502, "skip_count": 0.0, "step": 4546, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 26.442825928623453, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0888671875, "learning_rate": 0.000500581776286058, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 6641128.0, "repeat_count": 0.0, "routers_loss": 0.008198265917599201, "skip_count": 2.0, "step": 4548, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 26.454479242534596, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.06884765625, "learning_rate": 0.0005001939254675818, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 6644365.0, "repeat_count": 1.0, "routers_loss": 0.002710287692025304, "skip_count": 0.0, "step": 4550, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 26.46613255644574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.12353515625, "learning_rate": 0.0004998060745324181, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 6647168.0, "repeat_count": 0.0, "routers_loss": 0.001622092560864985, "skip_count": 1.0, "step": 4552, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.477785870356882, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1337890625, "learning_rate": 0.0004994182237139422, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 6649771.0, "repeat_count": 0.0, "routers_loss": 0.0025877850130200386, "skip_count": 0.0, "step": 4554, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.489439184268026, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.095703125, "learning_rate": 0.0004990303732455285, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 6653587.0, "repeat_count": 0.0, "routers_loss": 0.0015113765839487314, "skip_count": 0.0, "step": 4556, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 26.50109249817917, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.07568359375, "learning_rate": 0.0004986425233605524, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 6656188.0, "repeat_count": 1.0, "routers_loss": 0.0010426775552332401, "skip_count": 0.0, "step": 4558, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.512745812090312, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06005859375, "learning_rate": 0.0004982546742923883, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 6659331.0, "repeat_count": 0.0, "routers_loss": 0.0015704369870945811, "skip_count": 0.0, "step": 4560, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 26.524399126001455, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10546875, "learning_rate": 0.0004978668262744097, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 6661840.0, "repeat_count": 1.0, "routers_loss": 0.005690149962902069, "skip_count": 2.0, "step": 4562, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 26.5360524399126, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.11328125, "learning_rate": 0.0004974789795399907, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 6665533.0, "repeat_count": 2.0, "routers_loss": 0.0028237763326615095, "skip_count": 7.0, "step": 4564, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 26.547705753823745, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1640625, "learning_rate": 0.0004970911343225031, "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 6668967.0, "repeat_count": 2.0, "routers_loss": 0.004320870153605938, "skip_count": 2.0, "step": 4566, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.55935906773489, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.0004967032908553191, "loss": 0.0052, "macro_f1": 0.3333333432674408, "num_tokens": 6672117.0, "repeat_count": 0.0, "routers_loss": 0.00045716422027908266, "skip_count": 0.0, "step": 4568, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.57101238164603, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1181640625, "learning_rate": 0.0004963154493718091, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 6675140.0, "repeat_count": 0.0, "routers_loss": 0.0010262487921863794, "skip_count": 0.0, "step": 4570, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 26.582665695557175, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1611328125, "learning_rate": 0.0004959276101053426, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 6677564.0, "repeat_count": 0.0, "routers_loss": 0.0017529343022033572, "skip_count": 1.0, "step": 4572, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.594319009468318, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10888671875, "learning_rate": 0.0004955397732892878, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 6679977.0, "repeat_count": 0.0, "routers_loss": 0.00231762882322073, "skip_count": 0.0, "step": 4574, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 26.60597232337946, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08544921875, "learning_rate": 0.0004951519391570108, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 6683445.0, "repeat_count": 0.0, "routers_loss": 0.004312318749725819, "skip_count": 5.0, "step": 4576, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.617625637290605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06982421875, "learning_rate": 0.0004947641079418773, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 6686623.0, "repeat_count": 0.0, "routers_loss": 0.0009408554760739207, "skip_count": 0.0, "step": 4578, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 26.629278951201748, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0771484375, "learning_rate": 0.0004943762798772498, "loss": 0.0046, "macro_f1": 1.0, "num_tokens": 6689485.0, "repeat_count": 1.0, "routers_loss": 0.005510455463081598, "skip_count": 1.0, "step": 4580, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 26.64093226511289, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10009765625, "learning_rate": 0.0004939884551964902, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 6692312.0, "repeat_count": 0.0, "routers_loss": 0.010624443180859089, "skip_count": 5.0, "step": 4582, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 26.652585579024034, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06982421875, "learning_rate": 0.0004936006341329575, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 6696012.0, "repeat_count": 0.0, "routers_loss": 0.0035508403088897467, "skip_count": 1.0, "step": 4584, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 26.664238892935177, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.103515625, "learning_rate": 0.0004932128169200091, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 6698957.0, "repeat_count": 0.0, "routers_loss": 0.003411472076550126, "skip_count": 1.0, "step": 4586, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.67589220684632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05078125, "learning_rate": 0.0004928250037909998, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 6701999.0, "repeat_count": 0.0, "routers_loss": 0.0012003252049908042, "skip_count": 0.0, "step": 4588, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 26.687545520757464, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1533203125, "learning_rate": 0.0004924371949792814, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6704825.0, "repeat_count": 0.0, "routers_loss": 0.0009192083380185068, "skip_count": 2.0, "step": 4590, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 26.699198834668607, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04345703125, "learning_rate": 0.0004920493907182043, "loss": 0.005, "macro_f1": 0.6666666865348816, "num_tokens": 6707676.0, "repeat_count": 1.0, "routers_loss": 0.0010756748961284757, "skip_count": 0.0, "step": 4592, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 26.710852148579754, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.126953125, "learning_rate": 0.0004916615912411151, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 6710356.0, "repeat_count": 0.0, "routers_loss": 0.006558449473232031, "skip_count": 4.0, "step": 4594, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 26.722505462490897, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.09326171875, "learning_rate": 0.0004912737967813582, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 6713767.0, "repeat_count": 1.0, "routers_loss": 0.013518212363123894, "skip_count": 4.0, "step": 4596, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.73415877640204, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07373046875, "learning_rate": 0.0004908860075722747, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 6716756.0, "repeat_count": 0.0, "routers_loss": 0.0005307801766321063, "skip_count": 0.0, "step": 4598, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.745812090313184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11572265625, "learning_rate": 0.0004904982238472025, "loss": 0.0047, "macro_f1": 0.3333333432674408, "num_tokens": 6719616.0, "repeat_count": 0.0, "routers_loss": 0.001794300740584731, "skip_count": 0.0, "step": 4600, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 26.757465404224327, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.09130859375, "learning_rate": 0.0004901104458394764, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 6722338.0, "repeat_count": 1.0, "routers_loss": 0.005717587191611528, "skip_count": 0.0, "step": 4602, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.875, "avg_layers": 29.0, "epoch": 26.76911871813547, "f1_execute": 0.9824560880661011, "f1_repeat": 0.0, "f1_skip": 0.9333333373069763, "grad_norm": 0.1259765625, "learning_rate": 0.000489722673782427, "loss": 0.0053, "macro_f1": 0.6385964751243591, "num_tokens": 6724786.0, "repeat_count": 0.0, "routers_loss": 0.01890202797949314, "skip_count": 8.0, "step": 4604, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 26.780772032046613, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1591796875, "learning_rate": 0.0004893349079093829, "loss": 0.0063, "macro_f1": 1.0, "num_tokens": 6727405.0, "repeat_count": 1.0, "routers_loss": 0.00336140813305974, "skip_count": 2.0, "step": 4606, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 26.792425345957756, "f1_execute": 0.9836065173149109, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 0.12109375, "learning_rate": 0.0004889471484536672, "loss": 0.0043, "macro_f1": 0.8834244608879089, "num_tokens": 6730372.0, "repeat_count": 2.0, "routers_loss": 0.02585887722671032, "skip_count": 4.0, "step": 4608, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 26.8040786598689, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.09619140625, "learning_rate": 0.0004885593956486005, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 6734239.0, "repeat_count": 1.0, "routers_loss": 0.0013422230258584023, "skip_count": 0.0, "step": 4610, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 26.815731973780043, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1123046875, "learning_rate": 0.00048817164972749855, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 6737026.0, "repeat_count": 1.0, "routers_loss": 0.003054425586014986, "skip_count": 0.0, "step": 4612, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 26.827385287691186, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07861328125, "learning_rate": 0.00048778391092367345, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 6739843.0, "repeat_count": 0.0, "routers_loss": 0.0023217457346618176, "skip_count": 1.0, "step": 4614, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.83903860160233, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.078125, "learning_rate": 0.00048739617947043273, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 6742765.0, "repeat_count": 0.0, "routers_loss": 0.002601562300696969, "skip_count": 0.0, "step": 4616, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.850691915513472, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07275390625, "learning_rate": 0.00048700845560107947, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 6745640.0, "repeat_count": 0.0, "routers_loss": 0.0006664617685601115, "skip_count": 0.0, "step": 4618, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 26.86234522942462, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1708984375, "learning_rate": 0.0004866207395489126, "loss": 0.0072, "macro_f1": 1.0, "num_tokens": 6748435.0, "repeat_count": 1.0, "routers_loss": 0.005629207007586956, "skip_count": 4.0, "step": 4620, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 26.873998543335762, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.078125, "learning_rate": 0.0004862330315472258, "loss": 0.0036, "macro_f1": 1.0, "num_tokens": 6752144.0, "repeat_count": 1.0, "routers_loss": 0.0026369986589998007, "skip_count": 2.0, "step": 4622, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 26.885651857246906, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.09814453125, "learning_rate": 0.0004858453318293084, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 6754884.0, "repeat_count": 2.0, "routers_loss": 0.0012843635631725192, "skip_count": 0.0, "step": 4624, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 26.89730517115805, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10107421875, "learning_rate": 0.0004854576406284443, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 6757441.0, "repeat_count": 0.0, "routers_loss": 0.004122287966310978, "skip_count": 2.0, "step": 4626, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 26.908958485069192, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.103515625, "learning_rate": 0.00048506995817791264, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 6759973.0, "repeat_count": 1.0, "routers_loss": 0.00467624980956316, "skip_count": 0.0, "step": 4628, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1142578125, "learning_rate": 0.00048468228471098705, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 6762666.0, "repeat_count": 0.0, "routers_loss": 0.0008759537013247609, "skip_count": 0.0, "step": 4630, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 26.93226511289148, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.072265625, "learning_rate": 0.00048429462046093585, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 6765330.0, "repeat_count": 0.0, "routers_loss": 0.0007822737679816782, "skip_count": 1.0, "step": 4632, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.94391842680262, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09375, "learning_rate": 0.00048390696566102194, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 6769079.0, "repeat_count": 0.0, "routers_loss": 0.0042245578952133656, "skip_count": 0.0, "step": 4634, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 26.955571740713765, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.00048351932054450215, "loss": 0.004, "macro_f1": 0.32863849401474, "num_tokens": 6771835.0, "repeat_count": 0.0, "routers_loss": 0.01703135296702385, "skip_count": 1.0, "step": 4636, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 26.967225054624908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.166015625, "learning_rate": 0.00048313168534462803, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 6774370.0, "repeat_count": 0.0, "routers_loss": 0.012572955340147018, "skip_count": 2.0, "step": 4638, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 26.97887836853605, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.1005859375, "learning_rate": 0.0004827440602946446, "loss": 0.005, "macro_f1": 0.5950249433517456, "num_tokens": 6777232.0, "repeat_count": 0.0, "routers_loss": 0.019976945593953133, "skip_count": 3.0, "step": 4640, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 26.990531682447195, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.04296875, "learning_rate": 0.00048235644562779137, "loss": 0.0031, "macro_f1": 0.6666666865348816, "num_tokens": 6779893.0, "repeat_count": 1.0, "routers_loss": 0.001097018481232226, "skip_count": 0.0, "step": 4642, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.169921875, "learning_rate": 0.00048196884157730084, "loss": 0.0071, "macro_f1": 0.3333333432674408, "num_tokens": 6782832.0, "repeat_count": 0.0, "routers_loss": 0.0007213001372292638, "skip_count": 0.0, "step": 4644, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 27.011653313911143, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.126953125, "learning_rate": 0.0004815812483764, "loss": 0.0033, "macro_f1": 0.6666666865348816, "num_tokens": 6785697.0, "repeat_count": 0.0, "routers_loss": 0.005494846496731043, "skip_count": 1.0, "step": 4646, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 27.023306627822286, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.07568359375, "learning_rate": 0.0004811936662583086, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 6788304.0, "repeat_count": 1.0, "routers_loss": 0.0032515893690288067, "skip_count": 3.0, "step": 4648, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 27.03495994173343, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10400390625, "learning_rate": 0.00048080609545624004, "loss": 0.0064, "macro_f1": 1.0, "num_tokens": 6790751.0, "repeat_count": 2.0, "routers_loss": 0.001884431461803615, "skip_count": 2.0, "step": 4650, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 27.046613255644573, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.07177734375, "learning_rate": 0.00048041853620340094, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 6793763.0, "repeat_count": 1.0, "routers_loss": 0.0023580151610076427, "skip_count": 0.0, "step": 4652, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.058266569555716, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0634765625, "learning_rate": 0.0004800309887329907, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 6796436.0, "repeat_count": 0.0, "routers_loss": 0.00044709484791383147, "skip_count": 0.0, "step": 4654, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.06991988346686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.115234375, "learning_rate": 0.0004796434532782021, "loss": 0.0032, "macro_f1": 0.3333333432674408, "num_tokens": 6799525.0, "repeat_count": 0.0, "routers_loss": 0.0017301564803346992, "skip_count": 0.0, "step": 4656, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 27.081573197378006, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0634765625, "learning_rate": 0.0004792559300722202, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 6803111.0, "repeat_count": 0.0, "routers_loss": 0.006994984112679958, "skip_count": 2.0, "step": 4658, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.09322651128915, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.185546875, "learning_rate": 0.000478868419348223, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 6805583.0, "repeat_count": 0.0, "routers_loss": 0.0008382649393752217, "skip_count": 0.0, "step": 4660, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 27.104879825200292, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08642578125, "learning_rate": 0.0004784809213393809, "loss": 0.0058, "macro_f1": 0.6666666865348816, "num_tokens": 6808426.0, "repeat_count": 0.0, "routers_loss": 0.0015948698855936527, "skip_count": 2.0, "step": 4662, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 27.116533139111436, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0986328125, "learning_rate": 0.00047809343627885654, "loss": 0.0063, "macro_f1": 0.6666666865348816, "num_tokens": 6811507.0, "repeat_count": 0.0, "routers_loss": 0.008394718170166016, "skip_count": 3.0, "step": 4664, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 27.12818645302258, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.09033203125, "learning_rate": 0.000477705964399805, "loss": 0.004, "macro_f1": 0.661835789680481, "num_tokens": 6814283.0, "repeat_count": 1.0, "routers_loss": 0.00974660087376833, "skip_count": 1.0, "step": 4666, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.139839766933722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10888671875, "learning_rate": 0.00047731850593537314, "loss": 0.0046, "macro_f1": 0.3333333432674408, "num_tokens": 6817804.0, "repeat_count": 0.0, "routers_loss": 0.0012923756148666143, "skip_count": 0.0, "step": 4668, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.151493080844865, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07373046875, "learning_rate": 0.0004769310611187001, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 6820597.0, "repeat_count": 0.0, "routers_loss": 0.002028755145147443, "skip_count": 0.0, "step": 4670, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.16314639475601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.087890625, "learning_rate": 0.00047654363018291633, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 6823457.0, "repeat_count": 0.0, "routers_loss": 0.0014432461466640234, "skip_count": 0.0, "step": 4672, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.17479970866715, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.05224609375, "learning_rate": 0.0004761562133611446, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 6826407.0, "repeat_count": 0.0, "routers_loss": 0.0011223535984754562, "skip_count": 0.0, "step": 4674, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.186453022578295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06494140625, "learning_rate": 0.0004757688108864986, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 6828958.0, "repeat_count": 0.0, "routers_loss": 0.0012660097563639283, "skip_count": 0.0, "step": 4676, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 27.198106336489438, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1298828125, "learning_rate": 0.00047538142299208345, "loss": 0.0067, "macro_f1": 0.6666666865348816, "num_tokens": 6831617.0, "repeat_count": 0.0, "routers_loss": 0.0020474654156714678, "skip_count": 2.0, "step": 4678, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 27.20975965040058, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0751953125, "learning_rate": 0.0004749940499109959, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 6834158.0, "repeat_count": 0.0, "routers_loss": 0.00547066843137145, "skip_count": 1.0, "step": 4680, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.221412964311725, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09423828125, "learning_rate": 0.00047460669187632324, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 6837184.0, "repeat_count": 0.0, "routers_loss": 0.0031035973224788904, "skip_count": 0.0, "step": 4682, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.23306627822287, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1337890625, "learning_rate": 0.0004742193491211443, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 6839686.0, "repeat_count": 0.0, "routers_loss": 0.0064178756438195705, "skip_count": 0.0, "step": 4684, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 27.244719592134015, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.048828125, "learning_rate": 0.0004738320218785281, "loss": 0.0035, "macro_f1": 0.6666666865348816, "num_tokens": 6842567.0, "repeat_count": 0.0, "routers_loss": 0.001809359877370298, "skip_count": 2.0, "step": 4686, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 27.256372906045158, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1064453125, "learning_rate": 0.00047344471038153485, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 6845601.0, "repeat_count": 0.0, "routers_loss": 0.001034492626786232, "skip_count": 1.0, "step": 4688, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 27.2680262199563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.050537109375, "learning_rate": 0.000473057414863215, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 6848975.0, "repeat_count": 0.0, "routers_loss": 0.0030908610206097364, "skip_count": 1.0, "step": 4690, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 27.279679533867444, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.2177734375, "learning_rate": 0.00047267013555660935, "loss": 0.0047, "macro_f1": 0.5950249433517456, "num_tokens": 6852854.0, "repeat_count": 0.0, "routers_loss": 0.015343901701271534, "skip_count": 3.0, "step": 4692, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 27.291332847778587, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0888671875, "learning_rate": 0.00047228287269474923, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 6855164.0, "repeat_count": 0.0, "routers_loss": 0.0033314316533505917, "skip_count": 2.0, "step": 4694, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 27.30298616168973, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06640625, "learning_rate": 0.00047189562651065565, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 6858076.0, "repeat_count": 0.0, "routers_loss": 0.0026721847243607044, "skip_count": 2.0, "step": 4696, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 27.314639475600874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07421875, "learning_rate": 0.00047150839723734007, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 6861030.0, "repeat_count": 0.0, "routers_loss": 0.0034843729808926582, "skip_count": 2.0, "step": 4698, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 27.326292789512017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0986328125, "learning_rate": 0.00047112118510780334, "loss": 0.0033, "macro_f1": 0.6666666865348816, "num_tokens": 6863898.0, "repeat_count": 0.0, "routers_loss": 0.002086199354380369, "skip_count": 2.0, "step": 4700, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 27.33794610342316, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.140625, "learning_rate": 0.00047073399035503635, "loss": 0.0059, "macro_f1": 0.5950249433517456, "num_tokens": 6866583.0, "repeat_count": 0.0, "routers_loss": 0.016053685918450356, "skip_count": 3.0, "step": 4702, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 27.349599417334304, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1640625, "learning_rate": 0.0004703468132120193, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 6869196.0, "repeat_count": 1.0, "routers_loss": 0.009448041208088398, "skip_count": 4.0, "step": 4704, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 27.361252731245447, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.099609375, "learning_rate": 0.00046995965391172175, "loss": 0.0041, "macro_f1": 0.5950249433517456, "num_tokens": 6872295.0, "repeat_count": 0.0, "routers_loss": 0.014905494637787342, "skip_count": 3.0, "step": 4706, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.37290604515659, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08642578125, "learning_rate": 0.00046957251268710276, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 6875051.0, "repeat_count": 0.0, "routers_loss": 0.0034398012794554234, "skip_count": 0.0, "step": 4708, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 27.384559359067733, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.058349609375, "learning_rate": 0.00046918538977111035, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6878227.0, "repeat_count": 1.0, "routers_loss": 0.0006006480543874204, "skip_count": 0.0, "step": 4710, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 27.39621267297888, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.064453125, "learning_rate": 0.00046879828539668164, "loss": 0.0033, "macro_f1": 1.0, "num_tokens": 6880879.0, "repeat_count": 1.0, "routers_loss": 0.0052195084281265736, "skip_count": 2.0, "step": 4712, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 27.407865986890023, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0888671875, "learning_rate": 0.00046841119979674226, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 6883561.0, "repeat_count": 0.0, "routers_loss": 0.010053831152617931, "skip_count": 3.0, "step": 4714, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.419519300801166, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.0004680241332042072, "loss": 0.0048, "macro_f1": 0.3333333432674408, "num_tokens": 6887731.0, "repeat_count": 0.0, "routers_loss": 0.0008685563807375729, "skip_count": 0.0, "step": 4716, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 27.43117261471231, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.10986328125, "learning_rate": 0.00046763708585197946, "loss": 0.0037, "macro_f1": 0.6666666865348816, "num_tokens": 6890627.0, "repeat_count": 1.0, "routers_loss": 0.0028808380011469126, "skip_count": 0.0, "step": 4718, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 27.442825928623453, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.125, "learning_rate": 0.00046725005797295033, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 6893779.0, "repeat_count": 0.0, "routers_loss": 0.004639009479433298, "skip_count": 1.0, "step": 4720, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.454479242534596, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08837890625, "learning_rate": 0.00046686304980000004, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 6896592.0, "repeat_count": 0.0, "routers_loss": 0.0038122113328427076, "skip_count": 0.0, "step": 4722, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 27.46613255644574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.061279296875, "learning_rate": 0.0004664760615659963, "loss": 0.0032, "macro_f1": 0.6666666865348816, "num_tokens": 6899402.0, "repeat_count": 0.0, "routers_loss": 0.0012432645307853818, "skip_count": 2.0, "step": 4724, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.477785870356882, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11376953125, "learning_rate": 0.0004660890935037954, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 6902002.0, "repeat_count": 0.0, "routers_loss": 0.00065869081299752, "skip_count": 0.0, "step": 4726, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 27.489439184268026, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0908203125, "learning_rate": 0.0004657021458462408, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 6904603.0, "repeat_count": 0.0, "routers_loss": 0.008703840896487236, "skip_count": 3.0, "step": 4728, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.50109249817917, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11669921875, "learning_rate": 0.00046531521882616476, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 6907936.0, "repeat_count": 0.0, "routers_loss": 0.0005726252566091716, "skip_count": 0.0, "step": 4730, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 27.512745812090312, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0673828125, "learning_rate": 0.0004649283126763859, "loss": 0.0043, "macro_f1": 1.0, "num_tokens": 6911042.0, "repeat_count": 1.0, "routers_loss": 0.004650800488889217, "skip_count": 2.0, "step": 4732, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 27.524399126001455, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09521484375, "learning_rate": 0.00046454142762971084, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 6914267.0, "repeat_count": 0.0, "routers_loss": 0.0028053419664502144, "skip_count": 1.0, "step": 4734, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 27.5360524399126, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.072265625, "learning_rate": 0.0004641545639189337, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 6917279.0, "repeat_count": 1.0, "routers_loss": 0.006830111145973206, "skip_count": 6.0, "step": 4736, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 27.547705753823745, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10595703125, "learning_rate": 0.0004637677217768353, "loss": 0.0054, "macro_f1": 0.6666666865348816, "num_tokens": 6921983.0, "repeat_count": 0.0, "routers_loss": 0.007816185243427753, "skip_count": 5.0, "step": 4738, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 27.55935906773489, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10693359375, "learning_rate": 0.00046338090143618427, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 6924385.0, "repeat_count": 0.0, "routers_loss": 0.0012272255262359977, "skip_count": 3.0, "step": 4740, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 27.57101238164603, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.146484375, "learning_rate": 0.0004629941031297349, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 6926811.0, "repeat_count": 1.0, "routers_loss": 0.002275339560583234, "skip_count": 0.0, "step": 4742, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 27.582665695557175, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07373046875, "learning_rate": 0.0004626073270902295, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 6929780.0, "repeat_count": 0.0, "routers_loss": 0.0024967745412141085, "skip_count": 1.0, "step": 4744, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.594319009468318, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1044921875, "learning_rate": 0.0004622205735503961, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 6933275.0, "repeat_count": 0.0, "routers_loss": 0.0004423847422003746, "skip_count": 0.0, "step": 4746, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.60597232337946, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.04345703125, "learning_rate": 0.00046183384274294925, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 6936138.0, "repeat_count": 0.0, "routers_loss": 0.0003538542950991541, "skip_count": 0.0, "step": 4748, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 27.617625637290605, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.103515625, "learning_rate": 0.00046144713490059073, "loss": 0.0041, "macro_f1": 1.0, "num_tokens": 6939201.0, "repeat_count": 2.0, "routers_loss": 0.012201604433357716, "skip_count": 3.0, "step": 4750, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 27.629278951201748, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09228515625, "learning_rate": 0.0004610604502560071, "loss": 0.0065, "macro_f1": 0.6666666865348816, "num_tokens": 6941912.0, "repeat_count": 0.0, "routers_loss": 0.010459266602993011, "skip_count": 2.0, "step": 4752, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 27.64093226511289, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.06494140625, "learning_rate": 0.00046067378904187224, "loss": 0.004, "macro_f1": 1.0, "num_tokens": 6944965.0, "repeat_count": 1.0, "routers_loss": 0.001656921929679811, "skip_count": 3.0, "step": 4754, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.652585579024034, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1806640625, "learning_rate": 0.00046028715149084486, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 6947777.0, "repeat_count": 0.0, "routers_loss": 0.0013016877928748727, "skip_count": 0.0, "step": 4756, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 27.664238892935177, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.07080078125, "learning_rate": 0.00045990053783557063, "loss": 0.0031, "macro_f1": 0.6666666865348816, "num_tokens": 6950749.0, "repeat_count": 1.0, "routers_loss": 0.0007078209309838712, "skip_count": 0.0, "step": 4758, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.67589220684632, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1689453125, "learning_rate": 0.0004595139483086797, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 6953578.0, "repeat_count": 0.0, "routers_loss": 0.0003502067702356726, "skip_count": 0.0, "step": 4760, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.687545520757464, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.126953125, "learning_rate": 0.0004591273831427879, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 6956646.0, "repeat_count": 0.0, "routers_loss": 0.0004404402570798993, "skip_count": 0.0, "step": 4762, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 27.699198834668607, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08740234375, "learning_rate": 0.00045874084257049744, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 6960025.0, "repeat_count": 0.0, "routers_loss": 0.001545312232337892, "skip_count": 1.0, "step": 4764, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 27.710852148579754, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.2294921875, "learning_rate": 0.00045835432682439416, "loss": 0.0057, "macro_f1": 0.6666666865348816, "num_tokens": 6962694.0, "repeat_count": 0.0, "routers_loss": 0.0018008471233770251, "skip_count": 3.0, "step": 4766, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 27.722505462490897, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.203125, "learning_rate": 0.00045796783613705054, "loss": 0.007, "macro_f1": 1.0, "num_tokens": 6965964.0, "repeat_count": 1.0, "routers_loss": 0.005106959957629442, "skip_count": 2.0, "step": 4768, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 27.73415877640204, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0859375, "learning_rate": 0.00045758137074102234, "loss": 0.0069, "macro_f1": 0.6666666865348816, "num_tokens": 6968776.0, "repeat_count": 0.0, "routers_loss": 0.0015406949678435922, "skip_count": 1.0, "step": 4770, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 27.745812090313184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1416015625, "learning_rate": 0.00045719493086885174, "loss": 0.007, "macro_f1": 0.6666666865348816, "num_tokens": 6972491.0, "repeat_count": 0.0, "routers_loss": 0.004358130972832441, "skip_count": 2.0, "step": 4772, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.757465404224327, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07958984375, "learning_rate": 0.00045680851675306455, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 6975862.0, "repeat_count": 0.0, "routers_loss": 0.002154327230527997, "skip_count": 0.0, "step": 4774, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.76911871813547, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.083984375, "learning_rate": 0.00045642212862617086, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 6978398.0, "repeat_count": 0.0, "routers_loss": 0.0009562767227180302, "skip_count": 0.0, "step": 4776, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.780772032046613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.0004560357667206664, "loss": 0.0031, "macro_f1": 0.3333333432674408, "num_tokens": 6981232.0, "repeat_count": 0.0, "routers_loss": 0.000804189417976886, "skip_count": 0.0, "step": 4778, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 27.792425345957756, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11865234375, "learning_rate": 0.0004556494312690294, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 6984145.0, "repeat_count": 0.0, "routers_loss": 0.004992508329451084, "skip_count": 3.0, "step": 4780, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.8040786598689, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1083984375, "learning_rate": 0.00045526312250372393, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 6986669.0, "repeat_count": 0.0, "routers_loss": 0.0018597353482618928, "skip_count": 0.0, "step": 4782, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 34.0, "epoch": 27.815731973780043, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.12109375, "learning_rate": 0.00045487684065719636, "loss": 0.0068, "macro_f1": 0.9470900297164917, "num_tokens": 6989718.0, "repeat_count": 1.0, "routers_loss": 0.020007360726594925, "skip_count": 4.0, "step": 4784, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 27.827385287691186, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.07421875, "learning_rate": 0.0004544905859618783, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 6992152.0, "repeat_count": 1.0, "routers_loss": 0.0033076589461416006, "skip_count": 0.0, "step": 4786, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 27.83903860160233, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.091796875, "learning_rate": 0.00045410435865018425, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 6994433.0, "repeat_count": 0.0, "routers_loss": 0.006120073143392801, "skip_count": 3.0, "step": 4788, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 27.850691915513472, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0986328125, "learning_rate": 0.00045371815895451186, "loss": 0.0114, "macro_f1": 0.6666666865348816, "num_tokens": 6997070.0, "repeat_count": 1.0, "routers_loss": 0.0016110398573800921, "skip_count": 0.0, "step": 4790, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 27.86234522942462, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.173828125, "learning_rate": 0.00045333198710724334, "loss": 0.0053, "macro_f1": 1.0, "num_tokens": 6999398.0, "repeat_count": 1.0, "routers_loss": 0.004037959035485983, "skip_count": 1.0, "step": 4792, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 27.873998543335762, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.05517578125, "learning_rate": 0.00045294584334074284, "loss": 0.0037, "macro_f1": 1.0, "num_tokens": 7002890.0, "repeat_count": 1.0, "routers_loss": 0.003166663460433483, "skip_count": 3.0, "step": 4794, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 27.885651857246906, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1513671875, "learning_rate": 0.00045255972788735873, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 7005630.0, "repeat_count": 0.0, "routers_loss": 0.0022490755654871464, "skip_count": 1.0, "step": 4796, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 27.89730517115805, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1259765625, "learning_rate": 0.0004521736409794215, "loss": 0.006, "macro_f1": 0.6666666865348816, "num_tokens": 7008960.0, "repeat_count": 0.0, "routers_loss": 0.0032524289563298225, "skip_count": 2.0, "step": 4798, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.908958485069192, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09716796875, "learning_rate": 0.00045178758284924515, "loss": 0.0043, "macro_f1": 0.3333333432674408, "num_tokens": 7011629.0, "repeat_count": 0.0, "routers_loss": 0.0012420385610312223, "skip_count": 0.0, "step": 4800, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09423828125, "learning_rate": 0.0004514015537291259, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 7014879.0, "repeat_count": 0.0, "routers_loss": 0.0014956974191591144, "skip_count": 0.0, "step": 4802, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 27.93226511289148, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1015625, "learning_rate": 0.0004510155538513423, "loss": 0.0045, "macro_f1": 0.6666666865348816, "num_tokens": 7017621.0, "repeat_count": 0.0, "routers_loss": 0.0029657999984920025, "skip_count": 1.0, "step": 4804, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 27.94391842680262, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0947265625, "learning_rate": 0.0004506295834481561, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 7020895.0, "repeat_count": 1.0, "routers_loss": 0.007186858914792538, "skip_count": 6.0, "step": 4806, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 27.955571740713765, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0810546875, "learning_rate": 0.00045024364275181056, "loss": 0.0039, "macro_f1": 1.0, "num_tokens": 7023531.0, "repeat_count": 1.0, "routers_loss": 0.0028000941965729, "skip_count": 1.0, "step": 4808, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.967225054624908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07080078125, "learning_rate": 0.0004498577319945317, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 7026723.0, "repeat_count": 0.0, "routers_loss": 0.0030548216309398413, "skip_count": 0.0, "step": 4810, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 27.97887836853605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1083984375, "learning_rate": 0.0004494718514085268, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 7029148.0, "repeat_count": 0.0, "routers_loss": 0.0010578898945823312, "skip_count": 0.0, "step": 4812, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 27.990531682447195, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.107421875, "learning_rate": 0.0004490860012259857, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 7031920.0, "repeat_count": 0.0, "routers_loss": 0.001718417857773602, "skip_count": 2.0, "step": 4814, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 28.0, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.07421875, "learning_rate": 0.00044870018167907956, "loss": 0.0036, "macro_f1": 1.0, "num_tokens": 7034048.0, "repeat_count": 1.0, "routers_loss": 0.010512015782296658, "skip_count": 9.0, "step": 4816, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 28.011653313911143, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07080078125, "learning_rate": 0.0004483143929999608, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 7036884.0, "repeat_count": 0.0, "routers_loss": 0.00410064123570919, "skip_count": 1.0, "step": 4818, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 28.023306627822286, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.08447265625, "learning_rate": 0.0004479286354207641, "loss": 0.0048, "macro_f1": 1.0, "num_tokens": 7039802.0, "repeat_count": 1.0, "routers_loss": 0.011674715206027031, "skip_count": 5.0, "step": 4820, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 28.03495994173343, "f1_execute": 0.9855071902275085, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.0004475429091736047, "loss": 0.0057, "macro_f1": 0.661835789680481, "num_tokens": 7042846.0, "repeat_count": 1.0, "routers_loss": 0.01471671275794506, "skip_count": 1.0, "step": 4822, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 28.046613255644573, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09375, "learning_rate": 0.0004471572144905795, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 7045439.0, "repeat_count": 0.0, "routers_loss": 0.0031750360503792763, "skip_count": 6.0, "step": 4824, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.058266569555716, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10107421875, "learning_rate": 0.00044677155160376586, "loss": 0.005, "macro_f1": 0.3333333432674408, "num_tokens": 7048154.0, "repeat_count": 0.0, "routers_loss": 0.004686862695962191, "skip_count": 0.0, "step": 4826, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.06991988346686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1689453125, "learning_rate": 0.0004463859207452225, "loss": 0.0055, "macro_f1": 0.3333333432674408, "num_tokens": 7051567.0, "repeat_count": 0.0, "routers_loss": 0.001668520038947463, "skip_count": 0.0, "step": 4828, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 28.081573197378006, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06982421875, "learning_rate": 0.00044600032214698855, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 7055225.0, "repeat_count": 0.0, "routers_loss": 0.00406037550419569, "skip_count": 1.0, "step": 4830, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.09322651128915, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09912109375, "learning_rate": 0.000445614756041084, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 7058125.0, "repeat_count": 0.0, "routers_loss": 0.0023233515676110983, "skip_count": 0.0, "step": 4832, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.104879825200292, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08251953125, "learning_rate": 0.00044522922265950905, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 7061123.0, "repeat_count": 0.0, "routers_loss": 0.002488519065082073, "skip_count": 0.0, "step": 4834, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 28.116533139111436, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.09423828125, "learning_rate": 0.00044484372223424414, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 7064039.0, "repeat_count": 1.0, "routers_loss": 0.004502513445913792, "skip_count": 0.0, "step": 4836, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 28.12818645302258, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08837890625, "learning_rate": 0.0004444582549972503, "loss": 0.0061, "macro_f1": 0.6666666865348816, "num_tokens": 7067293.0, "repeat_count": 0.0, "routers_loss": 0.009801373817026615, "skip_count": 4.0, "step": 4838, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 28.139839766933722, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.115234375, "learning_rate": 0.0004440728211804682, "loss": 0.0048, "macro_f1": 0.6666666865348816, "num_tokens": 7070042.0, "repeat_count": 0.0, "routers_loss": 0.002106228144839406, "skip_count": 1.0, "step": 4840, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 28.151493080844865, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09716796875, "learning_rate": 0.0004436874210158186, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 7073009.0, "repeat_count": 0.0, "routers_loss": 0.009361197240650654, "skip_count": 5.0, "step": 4842, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.16314639475601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.061279296875, "learning_rate": 0.0004433020547352018, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 7076208.0, "repeat_count": 0.0, "routers_loss": 0.0015572059201076627, "skip_count": 0.0, "step": 4844, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 28.17479970866715, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10205078125, "learning_rate": 0.000442916722570498, "loss": 0.0035, "macro_f1": 0.6666666865348816, "num_tokens": 7079764.0, "repeat_count": 0.0, "routers_loss": 0.005371993873268366, "skip_count": 2.0, "step": 4846, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.186453022578295, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0849609375, "learning_rate": 0.00044253142475356676, "loss": 0.0037, "macro_f1": 0.3333333432674408, "num_tokens": 7083519.0, "repeat_count": 0.0, "routers_loss": 0.0007959226495586336, "skip_count": 0.0, "step": 4848, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.198106336489438, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06787109375, "learning_rate": 0.0004421461615162467, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 7086387.0, "repeat_count": 0.0, "routers_loss": 0.0005652094259858131, "skip_count": 0.0, "step": 4850, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.20975965040058, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.185546875, "learning_rate": 0.0004417609330903561, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 7089298.0, "repeat_count": 0.0, "routers_loss": 0.0011645951308310032, "skip_count": 0.0, "step": 4852, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 28.221412964311725, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11962890625, "learning_rate": 0.0004413757397076919, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 7092227.0, "repeat_count": 0.0, "routers_loss": 0.006529609207063913, "skip_count": 3.0, "step": 4854, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 28.23306627822287, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0849609375, "learning_rate": 0.0004409905816000302, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 7095406.0, "repeat_count": 1.0, "routers_loss": 0.000526046846061945, "skip_count": 0.0, "step": 4856, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 35.0, "epoch": 28.244719592134015, "f1_execute": 0.9855071902275085, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 0.1904296875, "learning_rate": 0.0004406054589991258, "loss": 0.0047, "macro_f1": 0.5507246255874634, "num_tokens": 7098308.0, "repeat_count": 0.0, "routers_loss": 0.009125502780079842, "skip_count": 2.0, "step": 4858, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.256372906045158, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0693359375, "learning_rate": 0.0004402203721367122, "loss": 0.0038, "macro_f1": 0.3333333432674408, "num_tokens": 7100967.0, "repeat_count": 0.0, "routers_loss": 0.005309605970978737, "skip_count": 0.0, "step": 4860, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 28.2680262199563, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0830078125, "learning_rate": 0.0004398353212445012, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 7103895.0, "repeat_count": 1.0, "routers_loss": 0.007367740850895643, "skip_count": 0.0, "step": 4862, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 31.0, "epoch": 28.279679533867444, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07958984375, "learning_rate": 0.000439450306554183, "loss": 0.0051, "macro_f1": 0.6666666865348816, "num_tokens": 7106449.0, "repeat_count": 0.0, "routers_loss": 0.006797585170716047, "skip_count": 5.0, "step": 4864, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.291332847778587, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1044921875, "learning_rate": 0.00043906532829742634, "loss": 0.0042, "macro_f1": 0.32863849401474, "num_tokens": 7109178.0, "repeat_count": 0.0, "routers_loss": 0.006597199942916632, "skip_count": 1.0, "step": 4866, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 28.30298616168973, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.07421875, "learning_rate": 0.0004386803867058775, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 7112078.0, "repeat_count": 1.0, "routers_loss": 0.0033808655571192503, "skip_count": 2.0, "step": 4868, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.314639475600874, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.059814453125, "learning_rate": 0.0004382954820111613, "loss": 0.0028, "macro_f1": 0.3333333432674408, "num_tokens": 7114732.0, "repeat_count": 0.0, "routers_loss": 0.0007155827479436994, "skip_count": 0.0, "step": 4870, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 28.326292789512017, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.078125, "learning_rate": 0.0004379106144448798, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 7117582.0, "repeat_count": 2.0, "routers_loss": 0.0029089516028761864, "skip_count": 0.0, "step": 4872, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.33794610342316, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.11865234375, "learning_rate": 0.0004375257842386131, "loss": 0.0057, "macro_f1": 0.3333333432674408, "num_tokens": 7120651.0, "repeat_count": 0.0, "routers_loss": 0.001652404316700995, "skip_count": 0.0, "step": 4874, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 28.349599417334304, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.057861328125, "learning_rate": 0.00043714099162391875, "loss": 0.0062, "macro_f1": 1.0, "num_tokens": 7124028.0, "repeat_count": 1.0, "routers_loss": 0.004947385285049677, "skip_count": 2.0, "step": 4876, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.361252731245447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.13671875, "learning_rate": 0.00043675623683233137, "loss": 0.0059, "macro_f1": 0.3333333432674408, "num_tokens": 7126944.0, "repeat_count": 0.0, "routers_loss": 0.0008446392603218555, "skip_count": 0.0, "step": 4878, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 34.0, "epoch": 28.37290604515659, "f1_execute": 0.9850746393203735, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.11669921875, "learning_rate": 0.0004363715200953633, "loss": 0.0048, "macro_f1": 0.5950249433517456, "num_tokens": 7129575.0, "repeat_count": 0.0, "routers_loss": 0.01222243346273899, "skip_count": 3.0, "step": 4880, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.384559359067733, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06640625, "learning_rate": 0.0004359868416445036, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 7132712.0, "repeat_count": 0.0, "routers_loss": 0.0011892464244738221, "skip_count": 0.0, "step": 4882, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.39621267297888, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08837890625, "learning_rate": 0.0004356022017112187, "loss": 0.0106, "macro_f1": 0.3333333432674408, "num_tokens": 7135306.0, "repeat_count": 0.0, "routers_loss": 0.001285142032429576, "skip_count": 0.0, "step": 4884, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.407865986890023, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10009765625, "learning_rate": 0.00043521760052695125, "loss": 0.006, "macro_f1": 0.3333333432674408, "num_tokens": 7138882.0, "repeat_count": 0.0, "routers_loss": 0.0014188806526362896, "skip_count": 0.0, "step": 4886, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.419519300801166, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.1279296875, "learning_rate": 0.0004348330383231212, "loss": 0.0062, "macro_f1": 0.3333333432674408, "num_tokens": 7142196.0, "repeat_count": 0.0, "routers_loss": 0.00043370871571823955, "skip_count": 0.0, "step": 4888, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 28.43117261471231, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0927734375, "learning_rate": 0.00043444851533112465, "loss": 0.0056, "macro_f1": 0.6666666865348816, "num_tokens": 7144774.0, "repeat_count": 0.0, "routers_loss": 0.0029518071096390486, "skip_count": 3.0, "step": 4890, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.442825928623453, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08935546875, "learning_rate": 0.0004340640317823342, "loss": 0.0034, "macro_f1": 0.3333333432674408, "num_tokens": 7147500.0, "repeat_count": 0.0, "routers_loss": 0.0007875750306993723, "skip_count": 0.0, "step": 4892, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 28.454479242534596, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.12890625, "learning_rate": 0.00043367958790809896, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 7150327.0, "repeat_count": 2.0, "routers_loss": 0.003170499112457037, "skip_count": 0.0, "step": 4894, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 28.46613255644574, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08642578125, "learning_rate": 0.00043329518393974364, "loss": 0.0053, "macro_f1": 0.6666666865348816, "num_tokens": 7154047.0, "repeat_count": 0.0, "routers_loss": 0.0019943055231124163, "skip_count": 2.0, "step": 4896, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.477785870356882, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0732421875, "learning_rate": 0.00043291082010856965, "loss": 0.003, "macro_f1": 0.3333333432674408, "num_tokens": 7156986.0, "repeat_count": 0.0, "routers_loss": 0.0050514740869402885, "skip_count": 0.0, "step": 4898, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 28.489439184268026, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.06640625, "learning_rate": 0.00043252649664585343, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 7160070.0, "repeat_count": 0.0, "routers_loss": 0.004743956495076418, "skip_count": 2.0, "step": 4900, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 28.50109249817917, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.068359375, "learning_rate": 0.00043214221378284787, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 7163071.0, "repeat_count": 0.0, "routers_loss": 0.0010389670496806502, "skip_count": 1.0, "step": 4902, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 28.512745812090312, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07470703125, "learning_rate": 0.00043175797175078103, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 7165405.0, "repeat_count": 0.0, "routers_loss": 0.0032993697095662355, "skip_count": 1.0, "step": 4904, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 28.524399126001455, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.052001953125, "learning_rate": 0.00043137377078085633, "loss": 0.0029, "macro_f1": 1.0, "num_tokens": 7168492.0, "repeat_count": 1.0, "routers_loss": 0.00513329217210412, "skip_count": 2.0, "step": 4906, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 33.0, "epoch": 28.5360524399126, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.10009765625, "learning_rate": 0.00043098961110425287, "loss": 0.0045, "macro_f1": 1.0, "num_tokens": 7171149.0, "repeat_count": 1.0, "routers_loss": 0.005999868270009756, "skip_count": 4.0, "step": 4908, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 28.547705753823745, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0732421875, "learning_rate": 0.00043060549295212446, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 7174067.0, "repeat_count": 1.0, "routers_loss": 0.001765047898516059, "skip_count": 0.0, "step": 4910, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 28.55935906773489, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0869140625, "learning_rate": 0.00043022141655560045, "loss": 0.0046, "macro_f1": 1.0, "num_tokens": 7176327.0, "repeat_count": 1.0, "routers_loss": 0.007636990863829851, "skip_count": 1.0, "step": 4912, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.57101238164603, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.072265625, "learning_rate": 0.00042983738214578433, "loss": 0.0044, "macro_f1": 0.3333333432674408, "num_tokens": 7179197.0, "repeat_count": 0.0, "routers_loss": 0.0005872685578651726, "skip_count": 0.0, "step": 4914, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 28.582665695557175, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0859375, "learning_rate": 0.0004294533899537551, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 7182397.0, "repeat_count": 0.0, "routers_loss": 0.0015288848662748933, "skip_count": 1.0, "step": 4916, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 28.594319009468318, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10009765625, "learning_rate": 0.0004290694402105659, "loss": 0.0064, "macro_f1": 0.6666666865348816, "num_tokens": 7185310.0, "repeat_count": 0.0, "routers_loss": 0.0017793180886656046, "skip_count": 1.0, "step": 4918, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 28.60597232337946, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.0673828125, "learning_rate": 0.00042868553314724423, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 7187961.0, "repeat_count": 1.0, "routers_loss": 0.0026765172369778156, "skip_count": 0.0, "step": 4920, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 37.0, "epoch": 28.617625637290605, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.1396484375, "learning_rate": 0.0004283016689947924, "loss": 0.005, "macro_f1": 1.0, "num_tokens": 7190643.0, "repeat_count": 2.0, "routers_loss": 0.0035569369792938232, "skip_count": 1.0, "step": 4922, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.629278951201748, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0478515625, "learning_rate": 0.00042791784798418644, "loss": 0.0033, "macro_f1": 0.3333333432674408, "num_tokens": 7193507.0, "repeat_count": 0.0, "routers_loss": 0.002684474689885974, "skip_count": 0.0, "step": 4924, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 36.0, "epoch": 28.64093226511289, "f1_execute": 0.9841269850730896, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.0634765625, "learning_rate": 0.0004275340703463767, "loss": 0.0058, "macro_f1": 0.9280423521995544, "num_tokens": 7196393.0, "repeat_count": 2.0, "routers_loss": 0.016304628923535347, "skip_count": 3.0, "step": 4926, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 28.652585579024034, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0556640625, "learning_rate": 0.0004271503363122871, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 7199499.0, "repeat_count": 0.0, "routers_loss": 0.014658570289611816, "skip_count": 9.0, "step": 4928, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 28.664238892935177, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1455078125, "learning_rate": 0.000426766646112816, "loss": 0.0039, "macro_f1": 0.6666666865348816, "num_tokens": 7201956.0, "repeat_count": 2.0, "routers_loss": 0.003816374810412526, "skip_count": 0.0, "step": 4930, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 28.67589220684632, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.1181640625, "learning_rate": 0.00042638299997883454, "loss": 0.0046, "macro_f1": 0.6666666865348816, "num_tokens": 7204570.0, "repeat_count": 1.0, "routers_loss": 0.0024246254470199347, "skip_count": 0.0, "step": 4932, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.687545520757464, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.07666015625, "learning_rate": 0.0004259993981411878, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 7207527.0, "repeat_count": 0.0, "routers_loss": 0.0029629329219460487, "skip_count": 0.0, "step": 4934, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 28.699198834668607, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.279296875, "learning_rate": 0.00042561584083069425, "loss": 0.0054, "macro_f1": 1.0, "num_tokens": 7210447.0, "repeat_count": 2.0, "routers_loss": 0.002321582054719329, "skip_count": 2.0, "step": 4936, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 28.710852148579754, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1318359375, "learning_rate": 0.0004252323282781453, "loss": 0.0055, "macro_f1": 0.6666666865348816, "num_tokens": 7212874.0, "repeat_count": 0.0, "routers_loss": 0.002426684135571122, "skip_count": 1.0, "step": 4938, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 28.722505462490897, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.08544921875, "learning_rate": 0.0004248488607143058, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 7216829.0, "repeat_count": 0.0, "routers_loss": 0.0020447236020118, "skip_count": 1.0, "step": 4940, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 28.73415877640204, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1318359375, "learning_rate": 0.0004244654383699129, "loss": 0.0042, "macro_f1": 0.6666666865348816, "num_tokens": 7219213.0, "repeat_count": 0.0, "routers_loss": 0.0021305729169398546, "skip_count": 2.0, "step": 4942, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.745812090313184, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.056884765625, "learning_rate": 0.0004240820614756777, "loss": 0.0036, "macro_f1": 0.3333333432674408, "num_tokens": 7222238.0, "repeat_count": 0.0, "routers_loss": 0.0018988613737747073, "skip_count": 0.0, "step": 4944, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 28.757465404224327, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09033203125, "learning_rate": 0.00042369873026228263, "loss": 0.0043, "macro_f1": 0.6666666865348816, "num_tokens": 7224597.0, "repeat_count": 0.0, "routers_loss": 0.0024105326738208532, "skip_count": 1.0, "step": 4946, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 28.76911871813547, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.162109375, "learning_rate": 0.0004233154449603832, "loss": 0.0052, "macro_f1": 1.0, "num_tokens": 7227674.0, "repeat_count": 2.0, "routers_loss": 0.006754135247319937, "skip_count": 2.0, "step": 4948, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 28.780772032046613, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.1318359375, "learning_rate": 0.00042293220580060767, "loss": 0.0035, "macro_f1": 0.6666666865348816, "num_tokens": 7230547.0, "repeat_count": 0.0, "routers_loss": 0.005115616600960493, "skip_count": 2.0, "step": 4950, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 30.0, "epoch": 28.792425345957756, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.060302734375, "learning_rate": 0.0004225490130135557, "loss": 0.0032, "macro_f1": 0.6666666865348816, "num_tokens": 7233784.0, "repeat_count": 0.0, "routers_loss": 0.0074838027358055115, "skip_count": 6.0, "step": 4952, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 38.0, "epoch": 28.8040786598689, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.059814453125, "learning_rate": 0.0004221658668298001, "loss": 0.0059, "macro_f1": 0.6666666865348816, "num_tokens": 7236604.0, "repeat_count": 2.0, "routers_loss": 0.004796371795237064, "skip_count": 0.0, "step": 4954, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 28.815731973780043, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.10888671875, "learning_rate": 0.0004217827674798845, "loss": 0.0036, "macro_f1": 0.6666666865348816, "num_tokens": 7239524.0, "repeat_count": 0.0, "routers_loss": 0.0007758511346764863, "skip_count": 2.0, "step": 4956, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 28.827385287691186, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.0712890625, "learning_rate": 0.00042139971519432546, "loss": 0.0024, "macro_f1": 1.0, "num_tokens": 7242036.0, "repeat_count": 1.0, "routers_loss": 0.003048182465136051, "skip_count": 2.0, "step": 4958, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 32.0, "epoch": 28.83903860160233, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0859375, "learning_rate": 0.0004210167102036102, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 7244788.0, "repeat_count": 0.0, "routers_loss": 0.010201022028923035, "skip_count": 4.0, "step": 4960, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 36.0, "epoch": 28.850691915513472, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.111328125, "learning_rate": 0.0004206337527381981, "loss": 0.0068, "macro_f1": 1.0, "num_tokens": 7248497.0, "repeat_count": 2.0, "routers_loss": 0.006179827265441418, "skip_count": 2.0, "step": 4962, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 28.86234522942462, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.138671875, "learning_rate": 0.0004202508430285199, "loss": 0.0033, "macro_f1": 0.6666666865348816, "num_tokens": 7251113.0, "repeat_count": 1.0, "routers_loss": 0.007138338405638933, "skip_count": 0.0, "step": 4964, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.873998543335762, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06591796875, "learning_rate": 0.0004198679813049772, "loss": 0.0039, "macro_f1": 0.3333333432674408, "num_tokens": 7254117.0, "repeat_count": 0.0, "routers_loss": 0.0007116185151971877, "skip_count": 0.0, "step": 4966, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.885651857246906, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.0732421875, "learning_rate": 0.0004194851677979436, "loss": 0.0054, "macro_f1": 0.3333333432674408, "num_tokens": 7257450.0, "repeat_count": 0.0, "routers_loss": 0.001931387116201222, "skip_count": 0.0, "step": 4968, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.89730517115805, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.06982421875, "learning_rate": 0.0004191024027377624, "loss": 0.0045, "macro_f1": 0.3333333432674408, "num_tokens": 7260287.0, "repeat_count": 0.0, "routers_loss": 0.003119426080957055, "skip_count": 0.0, "step": 4970, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.908958485069192, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08935546875, "learning_rate": 0.00041871968635474915, "loss": 0.0049, "macro_f1": 0.3333333432674408, "num_tokens": 7262890.0, "repeat_count": 0.0, "routers_loss": 0.0008562462171539664, "skip_count": 0.0, "step": 4972, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 28.920611798980335, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.07861328125, "learning_rate": 0.00041833701887918903, "loss": 0.0047, "macro_f1": 0.6666666865348816, "num_tokens": 7265753.0, "repeat_count": 0.0, "routers_loss": 0.0029687529895454645, "skip_count": 1.0, "step": 4974, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 28.93226511289148, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.123046875, "learning_rate": 0.000417954400541338, "loss": 0.0066, "macro_f1": 0.6666666865348816, "num_tokens": 7268424.0, "repeat_count": 0.0, "routers_loss": 0.0008327155373990536, "skip_count": 1.0, "step": 4976, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 28.94391842680262, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09375, "learning_rate": 0.0004175718315714232, "loss": 0.0049, "macro_f1": 0.6666666865348816, "num_tokens": 7271178.0, "repeat_count": 0.0, "routers_loss": 0.0006440202705562115, "skip_count": 2.0, "step": 4978, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.955571740713765, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.09228515625, "learning_rate": 0.0004171893121996409, "loss": 0.0063, "macro_f1": 0.3333333432674408, "num_tokens": 7273997.0, "repeat_count": 0.0, "routers_loss": 0.0015520754968747497, "skip_count": 0.0, "step": 4980, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 28.967225054624908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.11669921875, "learning_rate": 0.0004168068426561587, "loss": 0.003, "macro_f1": 0.6666666865348816, "num_tokens": 7276775.0, "repeat_count": 0.0, "routers_loss": 0.0030974161345511675, "skip_count": 2.0, "step": 4982, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 28.97887836853605, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.087890625, "learning_rate": 0.00041642442317111304, "loss": 0.0042, "macro_f1": 0.3333333432674408, "num_tokens": 7279537.0, "repeat_count": 0.0, "routers_loss": 0.001360819791443646, "skip_count": 0.0, "step": 4984, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 28.990531682447195, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09765625, "learning_rate": 0.0004160420539746115, "loss": 0.0052, "macro_f1": 0.6666666865348816, "num_tokens": 7282944.0, "repeat_count": 0.0, "routers_loss": 0.001873672823421657, "skip_count": 2.0, "step": 4986, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 29.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.10546875, "learning_rate": 0.0004156597352967304, "loss": 0.0033, "macro_f1": 0.3333333432674408, "num_tokens": 7285264.0, "repeat_count": 0.0, "routers_loss": 0.0036546732299029827, "skip_count": 0.0, "step": 4988, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 29.011653313911143, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.14453125, "learning_rate": 0.0004152774673675157, "loss": 0.0035, "macro_f1": 0.3333333432674408, "num_tokens": 7288245.0, "repeat_count": 0.0, "routers_loss": 0.0004330816736910492, "skip_count": 0.0, "step": 4990, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 29.023306627822286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.09814453125, "learning_rate": 0.00041489525041698384, "loss": 0.004, "macro_f1": 0.6666666865348816, "num_tokens": 7291712.0, "repeat_count": 0.0, "routers_loss": 0.003909943625330925, "skip_count": 1.0, "step": 4992, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 34.0, "epoch": 29.03495994173343, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.09765625, "learning_rate": 0.00041451308467511916, "loss": 0.0047, "macro_f1": 1.0, "num_tokens": 7294306.0, "repeat_count": 1.0, "routers_loss": 0.006706570275127888, "skip_count": 3.0, "step": 4994, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 36.0, "epoch": 29.046613255644573, "f1_execute": 0.98591548204422, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.08740234375, "learning_rate": 0.00041413097037187657, "loss": 0.0057, "macro_f1": 0.32863849401474, "num_tokens": 7297619.0, "repeat_count": 1.0, "routers_loss": 0.025886600837111473, "skip_count": 0.0, "step": 4996, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 35.0, "epoch": 29.058266569555716, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.0625, "learning_rate": 0.00041374890773717874, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 7300569.0, "repeat_count": 0.0, "routers_loss": 0.0009766226867213845, "skip_count": 1.0, "step": 4998, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 37.0, "epoch": 29.06991988346686, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.119140625, "learning_rate": 0.0004133668970009183, "loss": 0.0044, "macro_f1": 0.6666666865348816, "num_tokens": 7303045.0, "repeat_count": 1.0, "routers_loss": 0.0026601122226566076, "skip_count": 0.0, "step": 5000, "text_loss": 0.0 } ], "logging_steps": 2, "max_steps": 8600, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2200011927187259e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }