{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.436860068259386, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.010921501706484642, "f1_execute": 0.5142857432365417, "f1_repeat": 0.2222222238779068, "f1_skip": 0.0, "grad_norm": 31.125, "learning_rate": 2e-06, "loss": 2.8198, "macro_f1": 0.24550265073776245, "num_tokens": 3507.0, "repeat_count": 1.0, "routers_loss": 1.076732873916626, "skip_count": 2.0, "step": 2, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 0.021843003412969283, "f1_execute": 0.6666666865348816, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 38.5, "learning_rate": 6e-06, "loss": 3.125, "macro_f1": 0.222222238779068, "num_tokens": 7330.0, "repeat_count": 0.0, "routers_loss": 4.3143134117126465, "skip_count": 0.0, "step": 4, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 0.032764505119453925, "f1_execute": 0.5999999642372131, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 33.75, "learning_rate": 1e-05, "loss": 3.0713, "macro_f1": 0.19999998807907104, "num_tokens": 11360.0, "repeat_count": 0.0, "routers_loss": 1.8818678855895996, "skip_count": 0.0, "step": 6, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 0.04368600682593857, "f1_execute": 0.5789473652839661, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 37.25, "learning_rate": 1.4e-05, "loss": 2.992, "macro_f1": 0.19298246502876282, "num_tokens": 14241.0, "repeat_count": 1.0, "routers_loss": 2.340613603591919, "skip_count": 1.0, "step": 8, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 0.05460750853242321, "f1_execute": 0.6666666865348816, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 34.5, "learning_rate": 1.8e-05, "loss": 3.0072, "macro_f1": 0.222222238779068, "num_tokens": 17520.0, "repeat_count": 0.0, "routers_loss": 1.7916433811187744, "skip_count": 0.0, "step": 10, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.06552901023890785, "f1_execute": 0.6315789818763733, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 41.25, "learning_rate": 2.2e-05, "loss": 3.2227, "macro_f1": 0.21052633225917816, "num_tokens": 20401.0, "repeat_count": 1.0, "routers_loss": 2.2361459732055664, "skip_count": 1.0, "step": 12, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 0.07645051194539249, "f1_execute": 0.5789473652839661, "f1_repeat": 0.0, "f1_skip": 0.20000000298023224, "grad_norm": 31.875, "learning_rate": 2.6e-05, "loss": 3.1809, "macro_f1": 0.2596491277217865, "num_tokens": 23722.0, "repeat_count": 1.0, "routers_loss": 2.6635637283325195, "skip_count": 2.0, "step": 14, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.08737201365187713, "f1_execute": 0.6341463327407837, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 40.25, "learning_rate": 3e-05, "loss": 3.2606, "macro_f1": 0.21138212084770203, "num_tokens": 26754.0, "repeat_count": 0.0, "routers_loss": 1.967104196548462, "skip_count": 0.0, "step": 16, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 25.0, "epoch": 0.09829351535836177, "f1_execute": 0.5405405163764954, "f1_repeat": 0.0, "f1_skip": 0.1666666567325592, "grad_norm": 39.5, "learning_rate": 3.4000000000000007e-05, "loss": 2.9096, "macro_f1": 0.23573574423789978, "num_tokens": 29878.0, "repeat_count": 0.0, "routers_loss": 0.6965824365615845, "skip_count": 2.0, "step": 18, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 0.10921501706484642, "f1_execute": 0.6666666865348816, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 40.75, "learning_rate": 3.8e-05, "loss": 3.2996, "macro_f1": 0.222222238779068, "num_tokens": 32410.0, "repeat_count": 0.0, "routers_loss": 7.038887977600098, "skip_count": 0.0, "step": 20, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 0.12013651877133105, "f1_execute": 0.5641025900840759, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 32.5, "learning_rate": 4.2000000000000004e-05, "loss": 2.7437, "macro_f1": 0.18803420662879944, "num_tokens": 35122.0, "repeat_count": 1.0, "routers_loss": 4.3931450843811035, "skip_count": 2.0, "step": 22, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.1310580204778157, "f1_execute": 0.6341463327407837, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 44.0, "learning_rate": 4.6e-05, "loss": 2.9583, "macro_f1": 0.21138212084770203, "num_tokens": 38647.0, "repeat_count": 0.0, "routers_loss": 5.246743202209473, "skip_count": 2.0, "step": 24, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 0.14197952218430035, "f1_execute": 0.6666666865348816, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 36.0, "learning_rate": 5e-05, "loss": 2.0258, "macro_f1": 0.222222238779068, "num_tokens": 41759.0, "repeat_count": 0.0, "routers_loss": 4.385664463043213, "skip_count": 0.0, "step": 26, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.15290102389078497, "f1_execute": 0.6666666865348816, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 22.75, "learning_rate": 5.4e-05, "loss": 1.8932, "macro_f1": 0.222222238779068, "num_tokens": 45255.0, "repeat_count": 1.0, "routers_loss": 2.442974090576172, "skip_count": 2.0, "step": 28, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.16382252559726962, "f1_execute": 0.7272726893424988, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 20.5, "learning_rate": 5.800000000000001e-05, "loss": 1.5961, "macro_f1": 0.24242423474788666, "num_tokens": 48765.0, "repeat_count": 0.0, "routers_loss": 1.319467306137085, "skip_count": 3.0, "step": 30, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.17474402730375427, "f1_execute": 0.782608687877655, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 21.875, "learning_rate": 6.2e-05, "loss": 1.7529, "macro_f1": 0.260869562625885, "num_tokens": 51973.0, "repeat_count": 0.0, "routers_loss": 1.2047386169433594, "skip_count": 2.0, "step": 32, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.18566552901023892, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 26.875, "learning_rate": 6.6e-05, "loss": 1.4983, "macro_f1": 0.29333335161209106, "num_tokens": 54972.0, "repeat_count": 0.0, "routers_loss": 0.8216792345046997, "skip_count": 0.0, "step": 34, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.19658703071672354, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 20.75, "learning_rate": 7.000000000000001e-05, "loss": 1.2751, "macro_f1": 0.3076923191547394, "num_tokens": 58134.0, "repeat_count": 0.0, "routers_loss": 0.6534898281097412, "skip_count": 0.0, "step": 36, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.2075085324232082, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 17.75, "learning_rate": 7.4e-05, "loss": 0.9561, "macro_f1": 0.29333335161209106, "num_tokens": 61291.0, "repeat_count": 0.0, "routers_loss": 0.6772168278694153, "skip_count": 2.0, "step": 38, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.21843003412969283, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 15.875, "learning_rate": 7.8e-05, "loss": 0.6809, "macro_f1": 0.307692289352417, "num_tokens": 64406.0, "repeat_count": 0.0, "routers_loss": 0.7885609865188599, "skip_count": 1.0, "step": 40, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.22935153583617748, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 17.0, "learning_rate": 8.2e-05, "loss": 0.587, "macro_f1": 0.3205128312110901, "num_tokens": 67402.0, "repeat_count": 1.0, "routers_loss": 0.31721553206443787, "skip_count": 0.0, "step": 42, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.2402730375426621, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 12.625, "learning_rate": 8.599999999999999e-05, "loss": 0.4996, "macro_f1": 0.32098764181137085, "num_tokens": 70935.0, "repeat_count": 0.0, "routers_loss": 0.13094936311244965, "skip_count": 0.0, "step": 44, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.25119453924914675, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 12.5625, "learning_rate": 8.999999999999999e-05, "loss": 0.4226, "macro_f1": 0.29333335161209106, "num_tokens": 73716.0, "repeat_count": 2.0, "routers_loss": 0.48597365617752075, "skip_count": 3.0, "step": 46, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.2621160409556314, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.1875, "learning_rate": 9.400000000000001e-05, "loss": 0.2499, "macro_f1": 0.31446540355682373, "num_tokens": 76662.0, "repeat_count": 0.0, "routers_loss": 0.7850716710090637, "skip_count": 1.0, "step": 48, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.27303754266211605, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.5625, "learning_rate": 9.800000000000001e-05, "loss": 0.3029, "macro_f1": 0.3144654333591461, "num_tokens": 80080.0, "repeat_count": 2.0, "routers_loss": 1.4728330373764038, "skip_count": 1.0, "step": 50, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.2839590443686007, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.0625, "learning_rate": 0.000102, "loss": 0.2549, "macro_f1": 0.32098764181137085, "num_tokens": 82942.0, "repeat_count": 0.0, "routers_loss": 0.16784702241420746, "skip_count": 2.0, "step": 52, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.29488054607508535, "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.5625, "learning_rate": 0.000106, "loss": 0.2782, "macro_f1": 0.2857142686843872, "num_tokens": 85928.0, "repeat_count": 1.0, "routers_loss": 0.25518977642059326, "skip_count": 4.0, "step": 54, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.30580204778156994, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.75, "learning_rate": 0.00011, "loss": 0.2309, "macro_f1": 0.307692289352417, "num_tokens": 88804.0, "repeat_count": 0.0, "routers_loss": 0.21613653004169464, "skip_count": 3.0, "step": 56, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.3167235494880546, "f1_execute": 0.8571429252624512, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.625, "learning_rate": 0.000114, "loss": 0.1319, "macro_f1": 0.285714328289032, "num_tokens": 91674.0, "repeat_count": 1.0, "routers_loss": 0.4971294403076172, "skip_count": 5.0, "step": 58, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.32764505119453924, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.625, "learning_rate": 0.000118, "loss": 0.1637, "macro_f1": 0.3333333432674408, "num_tokens": 94858.0, "repeat_count": 0.0, "routers_loss": 0.01838197372853756, "skip_count": 0.0, "step": 60, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.3385665529010239, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.859375, "learning_rate": 0.000122, "loss": 0.1888, "macro_f1": 0.31446540355682373, "num_tokens": 97538.0, "repeat_count": 1.0, "routers_loss": 0.5383598804473877, "skip_count": 1.0, "step": 62, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 31.0, "epoch": 0.34948805460750854, "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.5, "learning_rate": 0.000126, "loss": 0.2176, "macro_f1": 0.2857142686843872, "num_tokens": 101249.0, "repeat_count": 1.0, "routers_loss": 0.2093856781721115, "skip_count": 1.0, "step": 64, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.3604095563139932, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.625, "learning_rate": 0.00013000000000000002, "loss": 0.1568, "macro_f1": 0.3333333432674408, "num_tokens": 104398.0, "repeat_count": 0.0, "routers_loss": 0.015723152086138725, "skip_count": 0.0, "step": 66, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.37133105802047783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.34375, "learning_rate": 0.000134, "loss": 0.2764, "macro_f1": 0.3333333432674408, "num_tokens": 107538.0, "repeat_count": 0.0, "routers_loss": 0.019146224483847618, "skip_count": 0.0, "step": 68, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.3822525597269625, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.25, "learning_rate": 0.00013800000000000002, "loss": 0.2035, "macro_f1": 0.3144654333591461, "num_tokens": 110689.0, "repeat_count": 3.0, "routers_loss": 0.6408394575119019, "skip_count": 0.0, "step": 70, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.3931740614334471, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.6875, "learning_rate": 0.00014199999999999998, "loss": 0.1986, "macro_f1": 0.32098764181137085, "num_tokens": 114205.0, "repeat_count": 0.0, "routers_loss": 0.04342689737677574, "skip_count": 0.0, "step": 72, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.4040955631399317, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.0625, "learning_rate": 0.000146, "loss": 0.1412, "macro_f1": 0.307692289352417, "num_tokens": 117140.0, "repeat_count": 0.0, "routers_loss": 0.12777170538902283, "skip_count": 1.0, "step": 74, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.4150170648464164, "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.5, "learning_rate": 0.00015, "loss": 0.1273, "macro_f1": 0.2857142686843872, "num_tokens": 120355.0, "repeat_count": 0.0, "routers_loss": 0.2570268511772156, "skip_count": 5.0, "step": 76, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.425938566552901, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.03125, "learning_rate": 0.000154, "loss": 0.1169, "macro_f1": 0.3333333432674408, "num_tokens": 123542.0, "repeat_count": 0.0, "routers_loss": 0.019178830087184906, "skip_count": 0.0, "step": 78, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.43686006825938567, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.5, "learning_rate": 0.000158, "loss": 0.1702, "macro_f1": 0.3006536066532135, "num_tokens": 126444.0, "repeat_count": 0.0, "routers_loss": 0.40678197145462036, "skip_count": 4.0, "step": 80, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.4477815699658703, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.40625, "learning_rate": 0.000162, "loss": 0.207, "macro_f1": 0.3333333432674408, "num_tokens": 129208.0, "repeat_count": 0.0, "routers_loss": 0.016020173206925392, "skip_count": 0.0, "step": 82, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.45870307167235497, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.0, "learning_rate": 0.00016600000000000002, "loss": 0.1469, "macro_f1": 0.3333333432674408, "num_tokens": 132692.0, "repeat_count": 0.0, "routers_loss": 0.015191584825515747, "skip_count": 0.0, "step": 84, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.4696245733788396, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.9375, "learning_rate": 0.00017, "loss": 0.1883, "macro_f1": 0.307692289352417, "num_tokens": 135433.0, "repeat_count": 1.0, "routers_loss": 0.29757800698280334, "skip_count": 2.0, "step": 86, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.1111111119389534, "avg_layers": 27.0, "epoch": 0.4805460750853242, "f1_execute": 0.7142857313156128, "f1_repeat": 0.0, "f1_skip": 0.1818181872367859, "grad_norm": 4.21875, "learning_rate": 0.000174, "loss": 0.2656, "macro_f1": 0.29870131611824036, "num_tokens": 139019.0, "repeat_count": 2.0, "routers_loss": 0.5406635403633118, "skip_count": 9.0, "step": 88, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.49146757679180886, "f1_execute": 0.8571429252624512, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.46875, "learning_rate": 0.000178, "loss": 0.2149, "macro_f1": 0.285714328289032, "num_tokens": 142156.0, "repeat_count": 3.0, "routers_loss": 0.9084331393241882, "skip_count": 3.0, "step": 90, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 25.0, "epoch": 0.5023890784982935, "f1_execute": 0.8979592323303223, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 6.15625, "learning_rate": 0.000182, "loss": 0.1461, "macro_f1": 0.4104308784008026, "num_tokens": 144866.0, "repeat_count": 1.0, "routers_loss": 0.298293799161911, "skip_count": 3.0, "step": 92, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5133105802047782, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.59375, "learning_rate": 0.000186, "loss": 0.1432, "macro_f1": 0.32098764181137085, "num_tokens": 148029.0, "repeat_count": 1.0, "routers_loss": 0.13971005380153656, "skip_count": 1.0, "step": 94, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5242320819112628, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.53125, "learning_rate": 0.00019, "loss": 0.1566, "macro_f1": 0.32098764181137085, "num_tokens": 151076.0, "repeat_count": 0.0, "routers_loss": 0.2203323394060135, "skip_count": 2.0, "step": 96, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5351535836177475, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 10.25, "learning_rate": 0.000194, "loss": 0.3221, "macro_f1": 0.32098764181137085, "num_tokens": 153825.0, "repeat_count": 0.0, "routers_loss": 0.22957128286361694, "skip_count": 2.0, "step": 98, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 0.5460750853242321, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.46875, "learning_rate": 0.00019800000000000002, "loss": 0.1445, "macro_f1": 0.3272727429866791, "num_tokens": 157200.0, "repeat_count": 0.0, "routers_loss": 0.0985352173447609, "skip_count": 0.0, "step": 100, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5569965870307167, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.125, "learning_rate": 0.000202, "loss": 0.2346, "macro_f1": 0.3144654333591461, "num_tokens": 161171.0, "repeat_count": 1.0, "routers_loss": 0.5728805065155029, "skip_count": 2.0, "step": 102, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 0.5679180887372014, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 4.65625, "learning_rate": 0.000206, "loss": 0.1532, "macro_f1": 0.4871794879436493, "num_tokens": 165319.0, "repeat_count": 0.0, "routers_loss": 0.08763546496629715, "skip_count": 2.0, "step": 104, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.578839590443686, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.375, "learning_rate": 0.00021, "loss": 0.1183, "macro_f1": 0.3272727429866791, "num_tokens": 168259.0, "repeat_count": 0.0, "routers_loss": 0.11700262129306793, "skip_count": 1.0, "step": 106, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5897610921501707, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.1875, "learning_rate": 0.000214, "loss": 0.1856, "macro_f1": 0.3144654333591461, "num_tokens": 171640.0, "repeat_count": 1.0, "routers_loss": 0.2897156774997711, "skip_count": 2.0, "step": 108, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.6006825938566553, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.84375, "learning_rate": 0.000218, "loss": 0.1379, "macro_f1": 0.3006536066532135, "num_tokens": 174452.0, "repeat_count": 0.0, "routers_loss": 0.20764203369617462, "skip_count": 4.0, "step": 110, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6116040955631399, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.9375, "learning_rate": 0.000222, "loss": 0.14, "macro_f1": 0.32098764181137085, "num_tokens": 177034.0, "repeat_count": 0.0, "routers_loss": 0.07773401588201523, "skip_count": 0.0, "step": 112, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.6225255972696245, "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.375, "learning_rate": 0.00022600000000000002, "loss": 0.1327, "macro_f1": 0.2857142984867096, "num_tokens": 180310.0, "repeat_count": 2.0, "routers_loss": 0.3696478605270386, "skip_count": 2.0, "step": 114, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6334470989761092, "f1_execute": 0.8333333730697632, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.984375, "learning_rate": 0.00023, "loss": 0.155, "macro_f1": 0.2777777910232544, "num_tokens": 182835.0, "repeat_count": 3.0, "routers_loss": 0.5024136304855347, "skip_count": 5.0, "step": 116, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6443686006825938, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.71875, "learning_rate": 0.00023400000000000002, "loss": 0.1566, "macro_f1": 0.3333333432674408, "num_tokens": 186508.0, "repeat_count": 0.0, "routers_loss": 0.02631981112062931, "skip_count": 0.0, "step": 118, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6552901023890785, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.03125, "learning_rate": 0.00023799999999999998, "loss": 0.1503, "macro_f1": 0.32098764181137085, "num_tokens": 190380.0, "repeat_count": 0.0, "routers_loss": 0.036612559109926224, "skip_count": 0.0, "step": 120, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.6662116040955631, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.28125, "learning_rate": 0.000242, "loss": 0.181, "macro_f1": 0.3076923191547394, "num_tokens": 193279.0, "repeat_count": 1.0, "routers_loss": 0.37753066420555115, "skip_count": 1.0, "step": 122, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.6771331058020478, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.75, "learning_rate": 0.000246, "loss": 0.1187, "macro_f1": 0.32098767161369324, "num_tokens": 196711.0, "repeat_count": 0.0, "routers_loss": 0.08419940620660782, "skip_count": 1.0, "step": 124, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 0.6880546075085324, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 4.0, "learning_rate": 0.00025, "loss": 0.1184, "macro_f1": 0.5492662787437439, "num_tokens": 199715.0, "repeat_count": 0.0, "routers_loss": 0.043020736426115036, "skip_count": 2.0, "step": 126, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.6989761092150171, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.9375, "learning_rate": 0.000254, "loss": 0.1421, "macro_f1": 0.32098767161369324, "num_tokens": 204217.0, "repeat_count": 0.0, "routers_loss": 0.0802314504981041, "skip_count": 1.0, "step": 128, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7098976109215017, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.0, "learning_rate": 0.00025800000000000004, "loss": 0.1719, "macro_f1": 0.32098764181137085, "num_tokens": 206777.0, "repeat_count": 1.0, "routers_loss": 0.09076520055532455, "skip_count": 1.0, "step": 130, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.7208191126279864, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.9375, "learning_rate": 0.000262, "loss": 0.1423, "macro_f1": 0.3272727429866791, "num_tokens": 210838.0, "repeat_count": 0.0, "routers_loss": 0.024340573698282242, "skip_count": 0.0, "step": 132, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.731740614334471, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.875, "learning_rate": 0.000266, "loss": 0.1, "macro_f1": 0.3333333432674408, "num_tokens": 213498.0, "repeat_count": 0.0, "routers_loss": 0.016322199255228043, "skip_count": 0.0, "step": 134, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7426621160409557, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.34375, "learning_rate": 0.00027, "loss": 0.1408, "macro_f1": 0.3272727429866791, "num_tokens": 216998.0, "repeat_count": 0.0, "routers_loss": 0.042806077748537064, "skip_count": 1.0, "step": 136, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7535836177474403, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.6875, "learning_rate": 0.00027400000000000005, "loss": 0.1012, "macro_f1": 0.32098764181137085, "num_tokens": 219952.0, "repeat_count": 0.0, "routers_loss": 0.12166574597358704, "skip_count": 2.0, "step": 138, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.764505119453925, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.59375, "learning_rate": 0.00027800000000000004, "loss": 0.1576, "macro_f1": 0.32098767161369324, "num_tokens": 223326.0, "repeat_count": 0.0, "routers_loss": 0.12389889359474182, "skip_count": 1.0, "step": 140, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.7754266211604095, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.46875, "learning_rate": 0.00028199999999999997, "loss": 0.1554, "macro_f1": 0.31446540355682373, "num_tokens": 226179.0, "repeat_count": 0.0, "routers_loss": 0.1315135806798935, "skip_count": 2.0, "step": 142, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7863481228668942, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.625, "learning_rate": 0.00028599999999999996, "loss": 0.1188, "macro_f1": 0.3272727429866791, "num_tokens": 228782.0, "repeat_count": 0.0, "routers_loss": 0.08095238357782364, "skip_count": 1.0, "step": 144, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7972696245733788, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.5, "learning_rate": 0.00029, "loss": 0.1616, "macro_f1": 0.3076923191547394, "num_tokens": 231771.0, "repeat_count": 0.0, "routers_loss": 0.13997994363307953, "skip_count": 4.0, "step": 146, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8081911262798634, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.0, "learning_rate": 0.000294, "loss": 0.1868, "macro_f1": 0.3333333432674408, "num_tokens": 234517.0, "repeat_count": 0.0, "routers_loss": 0.03245344012975693, "skip_count": 0.0, "step": 148, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.8191126279863481, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.375, "learning_rate": 0.000298, "loss": 0.148, "macro_f1": 0.3006536066532135, "num_tokens": 237324.0, "repeat_count": 1.0, "routers_loss": 0.36887046694755554, "skip_count": 2.0, "step": 150, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8300341296928327, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.734375, "learning_rate": 0.000302, "loss": 0.1759, "macro_f1": 0.3272727429866791, "num_tokens": 240657.0, "repeat_count": 1.0, "routers_loss": 0.1363309770822525, "skip_count": 0.0, "step": 152, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8409556313993174, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.65625, "learning_rate": 0.000306, "loss": 0.2043, "macro_f1": 0.3333333432674408, "num_tokens": 243741.0, "repeat_count": 0.0, "routers_loss": 0.024881718680262566, "skip_count": 0.0, "step": 154, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 0.851877133105802, "f1_execute": 0.8979592323303223, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 4.5625, "learning_rate": 0.00031, "loss": 0.1777, "macro_f1": 0.4326530694961548, "num_tokens": 246879.0, "repeat_count": 1.0, "routers_loss": 0.25227662920951843, "skip_count": 3.0, "step": 156, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 0.8627986348122867, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 5.28125, "learning_rate": 0.000314, "loss": 0.1641, "macro_f1": 0.47333335876464844, "num_tokens": 249880.0, "repeat_count": 2.0, "routers_loss": 0.3088915944099426, "skip_count": 3.0, "step": 158, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 25.0, "epoch": 0.8737201365187713, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 6.59375, "learning_rate": 0.00031800000000000003, "loss": 0.1687, "macro_f1": 0.41777777671813965, "num_tokens": 252725.0, "repeat_count": 0.0, "routers_loss": 0.11272747814655304, "skip_count": 3.0, "step": 160, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.884641638225256, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.90625, "learning_rate": 0.000322, "loss": 0.1408, "macro_f1": 0.3144654333591461, "num_tokens": 255951.0, "repeat_count": 0.0, "routers_loss": 0.05064187943935394, "skip_count": 0.0, "step": 162, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8955631399317406, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.65625, "learning_rate": 0.000326, "loss": 0.1509, "macro_f1": 0.3076923191547394, "num_tokens": 259469.0, "repeat_count": 0.0, "routers_loss": 0.21262036263942719, "skip_count": 2.0, "step": 164, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 26.0, "epoch": 0.9064846416382253, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 5.25, "learning_rate": 0.00033, "loss": 0.1578, "macro_f1": 0.4400000274181366, "num_tokens": 262272.0, "repeat_count": 1.0, "routers_loss": 0.1725386530160904, "skip_count": 3.0, "step": 166, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 0.9174061433447099, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.4375, "learning_rate": 0.00033400000000000004, "loss": 0.1471, "macro_f1": 0.3272727429866791, "num_tokens": 266415.0, "repeat_count": 0.0, "routers_loss": 0.02629087306559086, "skip_count": 0.0, "step": 168, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.9283276450511946, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.296875, "learning_rate": 0.00033800000000000003, "loss": 0.1185, "macro_f1": 0.32098767161369324, "num_tokens": 269700.0, "repeat_count": 0.0, "routers_loss": 0.05510875955224037, "skip_count": 1.0, "step": 170, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.9392491467576792, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.75, "learning_rate": 0.000342, "loss": 0.1637, "macro_f1": 0.3006536066532135, "num_tokens": 272587.0, "repeat_count": 1.0, "routers_loss": 0.27733829617500305, "skip_count": 3.0, "step": 172, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9501706484641638, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.4375, "learning_rate": 0.000346, "loss": 0.2034, "macro_f1": 0.32098764181137085, "num_tokens": 277005.0, "repeat_count": 0.0, "routers_loss": 0.14457301795482635, "skip_count": 2.0, "step": 174, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 0.9610921501706484, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 9.125, "learning_rate": 0.00035, "loss": 0.154, "macro_f1": 0.4871794879436493, "num_tokens": 279607.0, "repeat_count": 0.0, "routers_loss": 0.07571296393871307, "skip_count": 2.0, "step": 176, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.9720136518771331, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.84375, "learning_rate": 0.000354, "loss": 0.1894, "macro_f1": 0.32098767161369324, "num_tokens": 282547.0, "repeat_count": 1.0, "routers_loss": 0.5549371838569641, "skip_count": 0.0, "step": 178, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 0.9829351535836177, "f1_execute": 0.9411765336990356, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 4.9375, "learning_rate": 0.000358, "loss": 0.1226, "macro_f1": 0.5359477400779724, "num_tokens": 286081.0, "repeat_count": 2.0, "routers_loss": 0.2509016990661621, "skip_count": 2.0, "step": 180, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 0.9938566552901024, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.9375, "learning_rate": 0.000362, "loss": 0.1795, "macro_f1": 0.3272727429866791, "num_tokens": 289224.0, "repeat_count": 0.0, "routers_loss": 0.017457736656069756, "skip_count": 0.0, "step": 182, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.0, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.390625, "learning_rate": 0.000366, "loss": 0.1471, "macro_f1": 0.3272727429866791, "num_tokens": 290916.0, "repeat_count": 0.0, "routers_loss": 0.05112108215689659, "skip_count": 0.0, "step": 184, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0109215017064845, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.6875, "learning_rate": 0.00037, "loss": 0.1459, "macro_f1": 0.3076923191547394, "num_tokens": 294182.0, "repeat_count": 3.0, "routers_loss": 0.5592358708381653, "skip_count": 1.0, "step": 186, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0218430034129693, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.890625, "learning_rate": 0.000374, "loss": 0.1446, "macro_f1": 0.3333333432674408, "num_tokens": 296702.0, "repeat_count": 0.0, "routers_loss": 0.006012737285345793, "skip_count": 0.0, "step": 188, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.0327645051194538, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.96875, "learning_rate": 0.000378, "loss": 0.1394, "macro_f1": 0.31446540355682373, "num_tokens": 300348.0, "repeat_count": 0.0, "routers_loss": 0.06094537675380707, "skip_count": 2.0, "step": 190, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0436860068259386, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.625, "learning_rate": 0.000382, "loss": 0.0995, "macro_f1": 0.3272727429866791, "num_tokens": 303466.0, "repeat_count": 0.0, "routers_loss": 0.08475696295499802, "skip_count": 1.0, "step": 192, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0546075085324231, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.1875, "learning_rate": 0.000386, "loss": 0.1749, "macro_f1": 0.3333333432674408, "num_tokens": 306160.0, "repeat_count": 0.0, "routers_loss": 0.010187637060880661, "skip_count": 0.0, "step": 194, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.065529010238908, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.28125, "learning_rate": 0.00039000000000000005, "loss": 0.1692, "macro_f1": 0.3076923191547394, "num_tokens": 309453.0, "repeat_count": 1.0, "routers_loss": 0.20142780244350433, "skip_count": 1.0, "step": 196, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0764505119453924, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.90625, "learning_rate": 0.00039400000000000004, "loss": 0.1283, "macro_f1": 0.3333333432674408, "num_tokens": 312138.0, "repeat_count": 0.0, "routers_loss": 0.015577984042465687, "skip_count": 0.0, "step": 198, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 1.0873720136518772, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.46875, "learning_rate": 0.000398, "loss": 0.1061, "macro_f1": 0.4803921580314636, "num_tokens": 315833.0, "repeat_count": 0.0, "routers_loss": 0.1465342938899994, "skip_count": 2.0, "step": 200, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 1.0982935153583617, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 10.5625, "learning_rate": 0.000402, "loss": 0.1879, "macro_f1": 0.32098764181137085, "num_tokens": 318690.0, "repeat_count": 0.0, "routers_loss": 0.09964372962713242, "skip_count": 0.0, "step": 202, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 1.1092150170648465, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.25, "learning_rate": 0.00040600000000000006, "loss": 0.1226, "macro_f1": 0.32098764181137085, "num_tokens": 322294.0, "repeat_count": 0.0, "routers_loss": 0.030282732099294662, "skip_count": 0.0, "step": 204, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 1.120136518771331, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.125, "learning_rate": 0.00041, "loss": 0.1582, "macro_f1": 0.32098767161369324, "num_tokens": 325029.0, "repeat_count": 0.0, "routers_loss": 0.24788229167461395, "skip_count": 1.0, "step": 206, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 1.1310580204778158, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 5.9375, "learning_rate": 0.000414, "loss": 0.2048, "macro_f1": 0.4871794879436493, "num_tokens": 328178.0, "repeat_count": 0.0, "routers_loss": 0.031264692544937134, "skip_count": 1.0, "step": 208, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 25.0, "epoch": 1.1419795221843003, "f1_execute": 0.9166666269302368, "f1_repeat": 0.0, "f1_skip": 0.5714285373687744, "grad_norm": 6.8125, "learning_rate": 0.00041799999999999997, "loss": 0.1756, "macro_f1": 0.4960317313671112, "num_tokens": 331351.0, "repeat_count": 1.0, "routers_loss": 0.343823105096817, "skip_count": 4.0, "step": 210, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1529010238907849, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.84375, "learning_rate": 0.000422, "loss": 0.1246, "macro_f1": 0.3333333432674408, "num_tokens": 335297.0, "repeat_count": 0.0, "routers_loss": 0.014860679395496845, "skip_count": 0.0, "step": 212, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.1638225255972696, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.46875, "learning_rate": 0.000426, "loss": 0.1537, "macro_f1": 0.3006536066532135, "num_tokens": 338427.0, "repeat_count": 1.0, "routers_loss": 0.33231568336486816, "skip_count": 3.0, "step": 214, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1747440273037544, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.59375, "learning_rate": 0.00043, "loss": 0.1546, "macro_f1": 0.3333333432674408, "num_tokens": 341158.0, "repeat_count": 0.0, "routers_loss": 0.007448212709277868, "skip_count": 0.0, "step": 216, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 1.185665529010239, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.28125, "learning_rate": 0.00043400000000000003, "loss": 0.1468, "macro_f1": 0.3272727429866791, "num_tokens": 344329.0, "repeat_count": 0.0, "routers_loss": 0.02311822399497032, "skip_count": 0.0, "step": 218, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.1965870307167235, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.15625, "learning_rate": 0.000438, "loss": 0.1307, "macro_f1": 0.32098767161369324, "num_tokens": 348948.0, "repeat_count": 0.0, "routers_loss": 0.02867077849805355, "skip_count": 1.0, "step": 220, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 1.2075085324232082, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 4.96875, "learning_rate": 0.000442, "loss": 0.2046, "macro_f1": 0.5492662787437439, "num_tokens": 351741.0, "repeat_count": 0.0, "routers_loss": 0.03160649910569191, "skip_count": 2.0, "step": 222, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.2184300341296928, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.5, "learning_rate": 0.000446, "loss": 0.2074, "macro_f1": 0.3272727429866791, "num_tokens": 354852.0, "repeat_count": 1.0, "routers_loss": 0.1611160784959793, "skip_count": 0.0, "step": 224, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.3333333432674408, "avg_layers": 29.0, "epoch": 1.2293515358361775, "f1_execute": 0.8695651888847351, "f1_repeat": 0.4000000059604645, "f1_skip": 0.4000000059604645, "grad_norm": 3.328125, "learning_rate": 0.00045000000000000004, "loss": 0.118, "macro_f1": 0.5565217733383179, "num_tokens": 357431.0, "repeat_count": 2.0, "routers_loss": 0.7632720470428467, "skip_count": 3.0, "step": 226, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.240273037542662, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.59375, "learning_rate": 0.00045400000000000003, "loss": 0.0965, "macro_f1": 0.32098767161369324, "num_tokens": 360192.0, "repeat_count": 0.0, "routers_loss": 0.08349918574094772, "skip_count": 1.0, "step": 228, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 1.2511945392491468, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 9.9375, "learning_rate": 0.000458, "loss": 0.1714, "macro_f1": 0.4871794879436493, "num_tokens": 363209.0, "repeat_count": 0.0, "routers_loss": 0.06626693904399872, "skip_count": 2.0, "step": 230, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.2621160409556313, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.25, "learning_rate": 0.000462, "loss": 0.1859, "macro_f1": 0.3272727429866791, "num_tokens": 368262.0, "repeat_count": 0.0, "routers_loss": 0.03743857145309448, "skip_count": 0.0, "step": 232, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 1.273037542662116, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.34375, "learning_rate": 0.00046600000000000005, "loss": 0.2281, "macro_f1": 0.31446540355682373, "num_tokens": 370737.0, "repeat_count": 1.0, "routers_loss": 0.12340149283409119, "skip_count": 0.0, "step": 234, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 1.2839590443686006, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.8125, "learning_rate": 0.00047, "loss": 0.1535, "macro_f1": 0.32098764181137085, "num_tokens": 373272.0, "repeat_count": 0.0, "routers_loss": 0.04501926526427269, "skip_count": 0.0, "step": 236, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 1.2948805460750854, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.625, "learning_rate": 0.000474, "loss": 0.1701, "macro_f1": 0.3076923191547394, "num_tokens": 376924.0, "repeat_count": 1.0, "routers_loss": 0.3543643057346344, "skip_count": 1.0, "step": 238, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 27.0, "epoch": 1.30580204778157, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 5.78125, "learning_rate": 0.00047799999999999996, "loss": 0.1553, "macro_f1": 0.4400000274181366, "num_tokens": 380034.0, "repeat_count": 1.0, "routers_loss": 0.1332877278327942, "skip_count": 4.0, "step": 240, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3167235494880547, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.125, "learning_rate": 0.000482, "loss": 0.0874, "macro_f1": 0.3333333432674408, "num_tokens": 382846.0, "repeat_count": 0.0, "routers_loss": 0.013933669775724411, "skip_count": 0.0, "step": 242, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3276450511945392, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.765625, "learning_rate": 0.000486, "loss": 0.1505, "macro_f1": 0.3272727429866791, "num_tokens": 385916.0, "repeat_count": 0.0, "routers_loss": 0.11566327512264252, "skip_count": 1.0, "step": 244, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.3385665529010238, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.9375, "learning_rate": 0.00049, "loss": 0.1634, "macro_f1": 0.3272727429866791, "num_tokens": 388768.0, "repeat_count": 0.0, "routers_loss": 0.015394577756524086, "skip_count": 0.0, "step": 246, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 1.3494880546075085, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.78125, "learning_rate": 0.000494, "loss": 0.1493, "macro_f1": 0.32098764181137085, "num_tokens": 391699.0, "repeat_count": 0.0, "routers_loss": 0.05529753863811493, "skip_count": 0.0, "step": 248, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.3604095563139933, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.75, "learning_rate": 0.000498, "loss": 0.2545, "macro_f1": 0.31446540355682373, "num_tokens": 395380.0, "repeat_count": 1.0, "routers_loss": 0.15498189628124237, "skip_count": 1.0, "step": 250, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.3713310580204778, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.8125, "learning_rate": 0.0005020000000000001, "loss": 0.1998, "macro_f1": 0.31446540355682373, "num_tokens": 398414.0, "repeat_count": 0.0, "routers_loss": 0.053408559411764145, "skip_count": 2.0, "step": 252, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 1.3822525597269624, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.4375, "learning_rate": 0.000506, "loss": 0.1761, "macro_f1": 0.31446540355682373, "num_tokens": 401690.0, "repeat_count": 0.0, "routers_loss": 0.15143637359142303, "skip_count": 1.0, "step": 254, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3931740614334471, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.796875, "learning_rate": 0.00051, "loss": 0.1638, "macro_f1": 0.3272727429866791, "num_tokens": 404533.0, "repeat_count": 0.0, "routers_loss": 0.036931805312633514, "skip_count": 1.0, "step": 256, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 1.4040955631399317, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 7.21875, "learning_rate": 0.000514, "loss": 0.1765, "macro_f1": 0.5427350401878357, "num_tokens": 408175.0, "repeat_count": 1.0, "routers_loss": 0.16898785531520844, "skip_count": 2.0, "step": 258, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 1.4150170648464164, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.875, "learning_rate": 0.000518, "loss": 0.2172, "macro_f1": 0.4871794879436493, "num_tokens": 411160.0, "repeat_count": 0.0, "routers_loss": 0.05883602425456047, "skip_count": 1.0, "step": 260, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.425938566552901, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.453125, "learning_rate": 0.000522, "loss": 0.1121, "macro_f1": 0.31446540355682373, "num_tokens": 414391.0, "repeat_count": 0.0, "routers_loss": 0.14810606837272644, "skip_count": 2.0, "step": 262, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4368600682593857, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.46875, "learning_rate": 0.000526, "loss": 0.1772, "macro_f1": 0.3272727429866791, "num_tokens": 417763.0, "repeat_count": 1.0, "routers_loss": 0.20452100038528442, "skip_count": 0.0, "step": 264, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 26.0, "epoch": 1.4477815699658703, "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 3.5, "learning_rate": 0.0005300000000000001, "loss": 0.1446, "macro_f1": 0.4326530694961548, "num_tokens": 421881.0, "repeat_count": 2.0, "routers_loss": 0.32300108671188354, "skip_count": 3.0, "step": 266, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.20000000298023224, "avg_layers": 27.0, "epoch": 1.458703071672355, "f1_execute": 0.8260869383811951, "f1_repeat": 0.0, "f1_skip": 0.2857142984867096, "grad_norm": 3.96875, "learning_rate": 0.0005340000000000001, "loss": 0.1377, "macro_f1": 0.3706004321575165, "num_tokens": 424938.0, "repeat_count": 2.0, "routers_loss": 0.5530142784118652, "skip_count": 5.0, "step": 268, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.4696245733788396, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.5625, "learning_rate": 0.0005380000000000001, "loss": 0.1457, "macro_f1": 0.307692289352417, "num_tokens": 427555.0, "repeat_count": 0.0, "routers_loss": 0.10682675242424011, "skip_count": 3.0, "step": 270, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.480546075085324, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.90625, "learning_rate": 0.0005420000000000001, "loss": 0.174, "macro_f1": 0.3144654333591461, "num_tokens": 430168.0, "repeat_count": 1.0, "routers_loss": 0.9753395318984985, "skip_count": 2.0, "step": 272, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4914675767918089, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.75, "learning_rate": 0.000546, "loss": 0.1441, "macro_f1": 0.3333333432674408, "num_tokens": 433358.0, "repeat_count": 0.0, "routers_loss": 0.021224403753876686, "skip_count": 0.0, "step": 274, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.5023890784982936, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.78125, "learning_rate": 0.00055, "loss": 0.1624, "macro_f1": 0.32098764181137085, "num_tokens": 436460.0, "repeat_count": 0.0, "routers_loss": 0.08185791224241257, "skip_count": 2.0, "step": 276, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 1.5133105802047782, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.40625, "learning_rate": 0.000554, "loss": 0.1677, "macro_f1": 0.3144654333591461, "num_tokens": 439531.0, "repeat_count": 0.0, "routers_loss": 0.037240445613861084, "skip_count": 0.0, "step": 278, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.5242320819112627, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.765625, "learning_rate": 0.000558, "loss": 0.2688, "macro_f1": 0.3006536066532135, "num_tokens": 442521.0, "repeat_count": 1.0, "routers_loss": 0.3406132459640503, "skip_count": 3.0, "step": 280, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.5351535836177475, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.5625, "learning_rate": 0.0005620000000000001, "loss": 0.0875, "macro_f1": 0.3333333432674408, "num_tokens": 444942.0, "repeat_count": 0.0, "routers_loss": 0.006758399773389101, "skip_count": 0.0, "step": 282, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.5460750853242322, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.90625, "learning_rate": 0.000566, "loss": 0.1597, "macro_f1": 0.3144654333591461, "num_tokens": 448193.0, "repeat_count": 0.0, "routers_loss": 0.06801790744066238, "skip_count": 0.0, "step": 284, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.20000000298023224, "avg_layers": 27.0, "epoch": 1.5569965870307167, "f1_execute": 0.8510637879371643, "f1_repeat": 0.0, "f1_skip": 0.3333333134651184, "grad_norm": 4.78125, "learning_rate": 0.00057, "loss": 0.2027, "macro_f1": 0.39479905366897583, "num_tokens": 451293.0, "repeat_count": 3.0, "routers_loss": 0.23832914233207703, "skip_count": 5.0, "step": 286, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.5679180887372013, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.40625, "learning_rate": 0.000574, "loss": 0.1361, "macro_f1": 0.3272727429866791, "num_tokens": 454069.0, "repeat_count": 1.0, "routers_loss": 0.14267782866954803, "skip_count": 0.0, "step": 288, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.578839590443686, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.65625, "learning_rate": 0.000578, "loss": 0.1921, "macro_f1": 0.31446540355682373, "num_tokens": 457308.0, "repeat_count": 0.0, "routers_loss": 0.3219856917858124, "skip_count": 2.0, "step": 290, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.5897610921501708, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.90625, "learning_rate": 0.0005819999999999999, "loss": 0.2214, "macro_f1": 0.31446540355682373, "num_tokens": 460138.0, "repeat_count": 1.0, "routers_loss": 0.4478992521762848, "skip_count": 1.0, "step": 292, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6006825938566553, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.9375, "learning_rate": 0.0005859999999999999, "loss": 0.2102, "macro_f1": 0.3333333432674408, "num_tokens": 464029.0, "repeat_count": 0.0, "routers_loss": 0.019972749054431915, "skip_count": 0.0, "step": 294, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6116040955631399, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.5, "learning_rate": 0.00059, "loss": 0.1164, "macro_f1": 0.3076923191547394, "num_tokens": 467500.0, "repeat_count": 1.0, "routers_loss": 0.14752870798110962, "skip_count": 3.0, "step": 296, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6225255972696244, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.75, "learning_rate": 0.000594, "loss": 0.1434, "macro_f1": 0.32098764181137085, "num_tokens": 470734.0, "repeat_count": 1.0, "routers_loss": 0.30419600009918213, "skip_count": 1.0, "step": 298, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 1.6334470989761092, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.375, "learning_rate": 0.000598, "loss": 0.2077, "macro_f1": 0.31446540355682373, "num_tokens": 474514.0, "repeat_count": 0.0, "routers_loss": 0.06921514868736267, "skip_count": 2.0, "step": 300, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.644368600682594, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.125, "learning_rate": 0.000602, "loss": 0.1566, "macro_f1": 0.3076923191547394, "num_tokens": 477393.0, "repeat_count": 0.0, "routers_loss": 0.2468976378440857, "skip_count": 2.0, "step": 302, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.6552901023890785, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.375, "learning_rate": 0.000606, "loss": 0.1649, "macro_f1": 0.3272727429866791, "num_tokens": 480381.0, "repeat_count": 0.0, "routers_loss": 0.020447812974452972, "skip_count": 0.0, "step": 304, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.666211604095563, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.5, "learning_rate": 0.00061, "loss": 0.1423, "macro_f1": 0.31446540355682373, "num_tokens": 483502.0, "repeat_count": 0.0, "routers_loss": 0.05023586004972458, "skip_count": 1.0, "step": 306, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 1.6771331058020478, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.0, "learning_rate": 0.000614, "loss": 0.2042, "macro_f1": 0.3144654333591461, "num_tokens": 488006.0, "repeat_count": 0.0, "routers_loss": 0.049936871975660324, "skip_count": 0.0, "step": 308, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6880546075085325, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.46875, "learning_rate": 0.0006180000000000001, "loss": 0.2121, "macro_f1": 0.3272727429866791, "num_tokens": 491611.0, "repeat_count": 1.0, "routers_loss": 0.20010031759738922, "skip_count": 0.0, "step": 310, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.698976109215017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.09375, "learning_rate": 0.000622, "loss": 0.2415, "macro_f1": 0.3333333432674408, "num_tokens": 494903.0, "repeat_count": 0.0, "routers_loss": 0.01630268059670925, "skip_count": 0.0, "step": 312, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.7098976109215016, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.90625, "learning_rate": 0.000626, "loss": 0.2042, "macro_f1": 0.32098767161369324, "num_tokens": 497949.0, "repeat_count": 0.0, "routers_loss": 0.2674679160118103, "skip_count": 1.0, "step": 314, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 1.7208191126279864, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.5, "learning_rate": 0.00063, "loss": 0.1844, "macro_f1": 0.8823530077934265, "num_tokens": 501082.0, "repeat_count": 1.0, "routers_loss": 0.1621737778186798, "skip_count": 2.0, "step": 316, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.7317406143344711, "f1_execute": 0.8979592323303223, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 9.125, "learning_rate": 0.000634, "loss": 0.1708, "macro_f1": 0.5215420126914978, "num_tokens": 504131.0, "repeat_count": 2.0, "routers_loss": 0.6877225041389465, "skip_count": 2.0, "step": 318, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 1.7426621160409557, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.6875, "learning_rate": 0.000638, "loss": 0.1874, "macro_f1": 0.29333335161209106, "num_tokens": 507012.0, "repeat_count": 0.0, "routers_loss": 0.14521881937980652, "skip_count": 2.0, "step": 320, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 23.0, "epoch": 1.7535836177474402, "f1_execute": 0.8936170339584351, "f1_repeat": 0.0, "f1_skip": 0.444444477558136, "grad_norm": 4.46875, "learning_rate": 0.000642, "loss": 0.1489, "macro_f1": 0.44602054357528687, "num_tokens": 509950.0, "repeat_count": 0.0, "routers_loss": 0.15650968253612518, "skip_count": 4.0, "step": 322, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 1.764505119453925, "f1_execute": 0.8333333730697632, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.96875, "learning_rate": 0.000646, "loss": 0.163, "macro_f1": 0.2777777910232544, "num_tokens": 512900.0, "repeat_count": 2.0, "routers_loss": 0.3924711048603058, "skip_count": 3.0, "step": 324, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 1.7754266211604095, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.546875, "learning_rate": 0.0006500000000000001, "loss": 0.1452, "macro_f1": 0.5492662787437439, "num_tokens": 516233.0, "repeat_count": 0.0, "routers_loss": 0.038907092064619064, "skip_count": 2.0, "step": 326, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.7863481228668943, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.796875, "learning_rate": 0.0006540000000000001, "loss": 0.1641, "macro_f1": 0.3333333432674408, "num_tokens": 519636.0, "repeat_count": 0.0, "routers_loss": 0.0022514634765684605, "skip_count": 0.0, "step": 328, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 1.7972696245733788, "f1_execute": 0.9166666865348816, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 7.03125, "learning_rate": 0.0006580000000000001, "loss": 0.2761, "macro_f1": 0.4722222685813904, "num_tokens": 522992.0, "repeat_count": 2.0, "routers_loss": 0.4415050148963928, "skip_count": 2.0, "step": 330, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 1.8081911262798633, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.4375, "learning_rate": 0.000662, "loss": 0.1657, "macro_f1": 0.32098767161369324, "num_tokens": 526843.0, "repeat_count": 0.0, "routers_loss": 0.06788615882396698, "skip_count": 1.0, "step": 332, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 1.819112627986348, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 5.78125, "learning_rate": 0.000666, "loss": 0.1996, "macro_f1": 0.6603773832321167, "num_tokens": 530177.0, "repeat_count": 1.0, "routers_loss": 0.06985973566770554, "skip_count": 1.0, "step": 334, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.8300341296928329, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.46875, "learning_rate": 0.00067, "loss": 0.1877, "macro_f1": 0.307692289352417, "num_tokens": 533183.0, "repeat_count": 1.0, "routers_loss": 0.33230671286582947, "skip_count": 2.0, "step": 336, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8409556313993174, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.6875, "learning_rate": 0.000674, "loss": 0.1249, "macro_f1": 0.3076923191547394, "num_tokens": 536858.0, "repeat_count": 0.0, "routers_loss": 0.15104004740715027, "skip_count": 2.0, "step": 338, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.851877133105802, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.625, "learning_rate": 0.0006780000000000001, "loss": 0.1885, "macro_f1": 0.3272727429866791, "num_tokens": 540769.0, "repeat_count": 0.0, "routers_loss": 0.032123174518346786, "skip_count": 0.0, "step": 340, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8627986348122867, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.96875, "learning_rate": 0.0006820000000000001, "loss": 0.1809, "macro_f1": 0.3272727429866791, "num_tokens": 543783.0, "repeat_count": 0.0, "routers_loss": 0.05651572719216347, "skip_count": 1.0, "step": 342, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 1.8737201365187715, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.6875, "learning_rate": 0.0006860000000000001, "loss": 0.1804, "macro_f1": 0.3076923191547394, "num_tokens": 547125.0, "repeat_count": 0.0, "routers_loss": 0.13617995381355286, "skip_count": 2.0, "step": 344, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.884641638225256, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.65625, "learning_rate": 0.00069, "loss": 0.204, "macro_f1": 0.3272727429866791, "num_tokens": 550591.0, "repeat_count": 0.0, "routers_loss": 0.023369189351797104, "skip_count": 0.0, "step": 346, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8955631399317405, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.625, "learning_rate": 0.000694, "loss": 0.2275, "macro_f1": 0.3272727429866791, "num_tokens": 553785.0, "repeat_count": 0.0, "routers_loss": 0.09765879064798355, "skip_count": 1.0, "step": 348, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.9064846416382253, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 10.5, "learning_rate": 0.0006979999999999999, "loss": 0.4191, "macro_f1": 0.3333333432674408, "num_tokens": 556135.0, "repeat_count": 0.0, "routers_loss": 0.011158714070916176, "skip_count": 0.0, "step": 350, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.91740614334471, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.53125, "learning_rate": 0.0007019999999999999, "loss": 0.1557, "macro_f1": 0.3272727429866791, "num_tokens": 558980.0, "repeat_count": 0.0, "routers_loss": 0.036593515425920486, "skip_count": 0.0, "step": 352, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 26.0, "epoch": 1.9283276450511946, "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 4.1875, "learning_rate": 0.0007059999999999999, "loss": 0.183, "macro_f1": 0.4104308485984802, "num_tokens": 562187.0, "repeat_count": 1.0, "routers_loss": 0.48064568638801575, "skip_count": 4.0, "step": 354, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 1.9392491467576791, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.0, "learning_rate": 0.00071, "loss": 0.1982, "macro_f1": 0.32098767161369324, "num_tokens": 565278.0, "repeat_count": 0.0, "routers_loss": 0.13826458156108856, "skip_count": 1.0, "step": 356, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.9501706484641637, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.90625, "learning_rate": 0.000714, "loss": 0.2709, "macro_f1": 0.3333333432674408, "num_tokens": 567869.0, "repeat_count": 0.0, "routers_loss": 0.01589345932006836, "skip_count": 0.0, "step": 358, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 1.9610921501706484, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.09375, "learning_rate": 0.000718, "loss": 0.1902, "macro_f1": 0.3272727429866791, "num_tokens": 571069.0, "repeat_count": 0.0, "routers_loss": 0.029062755405902863, "skip_count": 0.0, "step": 360, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 1.9720136518771332, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.125, "learning_rate": 0.000722, "loss": 0.2125, "macro_f1": 0.3076923191547394, "num_tokens": 573838.0, "repeat_count": 1.0, "routers_loss": 0.3241157531738281, "skip_count": 1.0, "step": 362, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.9829351535836177, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.46875, "learning_rate": 0.000726, "loss": 0.2176, "macro_f1": 0.3272727429866791, "num_tokens": 576554.0, "repeat_count": 0.0, "routers_loss": 0.03469887003302574, "skip_count": 0.0, "step": 364, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 1.9938566552901023, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 7.34375, "learning_rate": 0.00073, "loss": 0.182, "macro_f1": 0.4803921580314636, "num_tokens": 579653.0, "repeat_count": 1.0, "routers_loss": 0.11800751090049744, "skip_count": 1.0, "step": 366, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 15.125, "learning_rate": 0.000734, "loss": 0.3307, "macro_f1": 0.3333333432674408, "num_tokens": 581832.0, "repeat_count": 0.0, "routers_loss": 0.014465595595538616, "skip_count": 0.0, "step": 368, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 2.0109215017064845, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.375, "learning_rate": 0.000738, "loss": 0.1482, "macro_f1": 0.3272727429866791, "num_tokens": 585207.0, "repeat_count": 0.0, "routers_loss": 0.030198052525520325, "skip_count": 0.0, "step": 370, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.021843003412969, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.78125, "learning_rate": 0.000742, "loss": 0.0906, "macro_f1": 0.32098767161369324, "num_tokens": 588893.0, "repeat_count": 0.0, "routers_loss": 0.04226446524262428, "skip_count": 1.0, "step": 372, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 2.032764505119454, "f1_execute": 0.9777777791023254, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 8.0625, "learning_rate": 0.000746, "loss": 0.2092, "macro_f1": 0.9259259104728699, "num_tokens": 592246.0, "repeat_count": 3.0, "routers_loss": 0.05995782092213631, "skip_count": 3.0, "step": 374, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 2.0436860068259386, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.03125, "learning_rate": 0.00075, "loss": 0.1724, "macro_f1": 0.3006536066532135, "num_tokens": 594777.0, "repeat_count": 0.0, "routers_loss": 0.14366891980171204, "skip_count": 3.0, "step": 376, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.054607508532423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.90625, "learning_rate": 0.000754, "loss": 0.0803, "macro_f1": 0.3333333432674408, "num_tokens": 597931.0, "repeat_count": 0.0, "routers_loss": 0.0027963866014033556, "skip_count": 0.0, "step": 378, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 29.0, "epoch": 2.0655290102389077, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 6.28125, "learning_rate": 0.000758, "loss": 0.2873, "macro_f1": 0.5359477400779724, "num_tokens": 601227.0, "repeat_count": 0.0, "routers_loss": 0.15012779831886292, "skip_count": 2.0, "step": 380, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 2.0764505119453927, "f1_execute": 0.9615384340286255, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 4.96875, "learning_rate": 0.000762, "loss": 0.1602, "macro_f1": 0.5427350401878357, "num_tokens": 604297.0, "repeat_count": 2.0, "routers_loss": 0.0708698183298111, "skip_count": 1.0, "step": 382, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.20000000298023224, "avg_layers": 28.0, "epoch": 2.087372013651877, "f1_execute": 0.8510638475418091, "f1_repeat": 0.0, "f1_skip": 0.3333333134651184, "grad_norm": 8.25, "learning_rate": 0.0007660000000000001, "loss": 0.1786, "macro_f1": 0.3947990834712982, "num_tokens": 607137.0, "repeat_count": 2.0, "routers_loss": 0.46035754680633545, "skip_count": 5.0, "step": 384, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 2.0982935153583617, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 9.8125, "learning_rate": 0.0007700000000000001, "loss": 0.1415, "macro_f1": 0.4871794879436493, "num_tokens": 610067.0, "repeat_count": 0.0, "routers_loss": 0.04594701901078224, "skip_count": 2.0, "step": 386, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 26.0, "epoch": 2.1092150170648463, "f1_execute": 0.9387754797935486, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 5.59375, "learning_rate": 0.0007740000000000001, "loss": 0.1453, "macro_f1": 0.42403626441955566, "num_tokens": 613020.0, "repeat_count": 1.0, "routers_loss": 0.21872307360172272, "skip_count": 4.0, "step": 388, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.1201365187713312, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.71875, "learning_rate": 0.000778, "loss": 0.2459, "macro_f1": 0.3006536066532135, "num_tokens": 615777.0, "repeat_count": 0.0, "routers_loss": 0.17068128287792206, "skip_count": 3.0, "step": 390, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.131058020477816, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.40625, "learning_rate": 0.000782, "loss": 0.1734, "macro_f1": 0.5492662787437439, "num_tokens": 618883.0, "repeat_count": 0.0, "routers_loss": 0.06883871555328369, "skip_count": 2.0, "step": 392, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 2.1419795221843003, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 4.4375, "learning_rate": 0.000786, "loss": 0.1822, "macro_f1": 0.4871794879436493, "num_tokens": 621785.0, "repeat_count": 0.0, "routers_loss": 0.021629702299833298, "skip_count": 2.0, "step": 394, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 2.152901023890785, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 10.4375, "learning_rate": 0.00079, "loss": 0.2188, "macro_f1": 0.4871794879436493, "num_tokens": 624497.0, "repeat_count": 0.0, "routers_loss": 0.02989846095442772, "skip_count": 2.0, "step": 396, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.1638225255972694, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.03125, "learning_rate": 0.0007940000000000001, "loss": 0.2, "macro_f1": 0.3333333432674408, "num_tokens": 627530.0, "repeat_count": 0.0, "routers_loss": 0.0030090075451880693, "skip_count": 0.0, "step": 398, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.1747440273037544, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.625, "learning_rate": 0.0007980000000000001, "loss": 0.1503, "macro_f1": 0.3272727429866791, "num_tokens": 630816.0, "repeat_count": 0.0, "routers_loss": 0.02026674523949623, "skip_count": 0.0, "step": 400, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.185665529010239, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.6875, "learning_rate": 0.0008020000000000001, "loss": 0.1285, "macro_f1": 0.3272727429866791, "num_tokens": 633715.0, "repeat_count": 1.0, "routers_loss": 0.08777285367250443, "skip_count": 0.0, "step": 402, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.1965870307167235, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.59375, "learning_rate": 0.0008060000000000001, "loss": 0.186, "macro_f1": 0.3272727429866791, "num_tokens": 636871.0, "repeat_count": 0.0, "routers_loss": 0.049915000796318054, "skip_count": 1.0, "step": 404, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.207508532423208, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 5.34375, "learning_rate": 0.0008100000000000001, "loss": 0.1592, "macro_f1": 0.5492662787437439, "num_tokens": 639784.0, "repeat_count": 0.0, "routers_loss": 0.05443386733531952, "skip_count": 2.0, "step": 406, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 2.218430034129693, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.90625, "learning_rate": 0.0008139999999999999, "loss": 0.1947, "macro_f1": 0.3272727429866791, "num_tokens": 642682.0, "repeat_count": 0.0, "routers_loss": 0.021953796967864037, "skip_count": 0.0, "step": 408, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.2293515358361775, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.625, "learning_rate": 0.0008179999999999999, "loss": 0.2197, "macro_f1": 0.3333333432674408, "num_tokens": 645962.0, "repeat_count": 0.0, "routers_loss": 0.010657553561031818, "skip_count": 0.0, "step": 410, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.240273037542662, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.421875, "learning_rate": 0.0008219999999999999, "loss": 0.2091, "macro_f1": 0.3333333432674408, "num_tokens": 649180.0, "repeat_count": 0.0, "routers_loss": 0.013879667967557907, "skip_count": 0.0, "step": 412, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.2511945392491466, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.0625, "learning_rate": 0.000826, "loss": 0.1555, "macro_f1": 0.31446540355682373, "num_tokens": 653015.0, "repeat_count": 0.0, "routers_loss": 0.12807206809520721, "skip_count": 2.0, "step": 414, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 25.0, "epoch": 2.2621160409556316, "f1_execute": 0.9166666269302368, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.453125, "learning_rate": 0.00083, "loss": 0.1335, "macro_f1": 0.5277777910232544, "num_tokens": 655892.0, "repeat_count": 2.0, "routers_loss": 0.8250671625137329, "skip_count": 3.0, "step": 416, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.273037542662116, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 5.8125, "learning_rate": 0.000834, "loss": 0.1831, "macro_f1": 0.5492662787437439, "num_tokens": 658426.0, "repeat_count": 0.0, "routers_loss": 0.03139641508460045, "skip_count": 2.0, "step": 418, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.2839590443686006, "f1_execute": 0.9615384340286255, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 3.40625, "learning_rate": 0.000838, "loss": 0.1345, "macro_f1": 0.5427350401878357, "num_tokens": 661809.0, "repeat_count": 2.0, "routers_loss": 0.0441780611872673, "skip_count": 0.0, "step": 420, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.294880546075085, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.84375, "learning_rate": 0.000842, "loss": 0.1127, "macro_f1": 0.3272727429866791, "num_tokens": 664874.0, "repeat_count": 0.0, "routers_loss": 0.44332680106163025, "skip_count": 1.0, "step": 422, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.3058020477815697, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.96875, "learning_rate": 0.000846, "loss": 0.1225, "macro_f1": 0.3272727429866791, "num_tokens": 668325.0, "repeat_count": 0.0, "routers_loss": 0.059455983340740204, "skip_count": 0.0, "step": 424, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.3167235494880547, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 6.8125, "learning_rate": 0.00085, "loss": 0.1816, "macro_f1": 0.5359477400779724, "num_tokens": 671097.0, "repeat_count": 2.0, "routers_loss": 0.3154633641242981, "skip_count": 2.0, "step": 426, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 25.0, "epoch": 2.3276450511945392, "f1_execute": 0.8979592323303223, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 5.65625, "learning_rate": 0.000854, "loss": 0.122, "macro_f1": 0.4104308784008026, "num_tokens": 674042.0, "repeat_count": 1.0, "routers_loss": 0.4580267667770386, "skip_count": 3.0, "step": 428, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.3385665529010238, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.21875, "learning_rate": 0.000858, "loss": 0.1113, "macro_f1": 0.3272727429866791, "num_tokens": 677016.0, "repeat_count": 0.0, "routers_loss": 0.015222650021314621, "skip_count": 0.0, "step": 430, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.3494880546075088, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.78125, "learning_rate": 0.000862, "loss": 0.1379, "macro_f1": 0.3333333432674408, "num_tokens": 679990.0, "repeat_count": 1.0, "routers_loss": 0.24279196560382843, "skip_count": 0.0, "step": 432, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.3604095563139933, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.28125, "learning_rate": 0.000866, "loss": 0.1476, "macro_f1": 0.3333333432674408, "num_tokens": 682786.0, "repeat_count": 1.0, "routers_loss": 0.1684337556362152, "skip_count": 0.0, "step": 434, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.371331058020478, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.0, "learning_rate": 0.00087, "loss": 0.1204, "macro_f1": 0.3272727429866791, "num_tokens": 685882.0, "repeat_count": 1.0, "routers_loss": 0.19464725255966187, "skip_count": 0.0, "step": 436, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.3822525597269624, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.71875, "learning_rate": 0.000874, "loss": 0.1124, "macro_f1": 0.32098764181137085, "num_tokens": 689570.0, "repeat_count": 0.0, "routers_loss": 0.05968143790960312, "skip_count": 2.0, "step": 438, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.393174061433447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.84375, "learning_rate": 0.000878, "loss": 0.1528, "macro_f1": 0.3333333432674408, "num_tokens": 693559.0, "repeat_count": 0.0, "routers_loss": 0.004517437424510717, "skip_count": 0.0, "step": 440, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.404095563139932, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.125, "learning_rate": 0.000882, "loss": 0.1353, "macro_f1": 0.3006536066532135, "num_tokens": 696374.0, "repeat_count": 0.0, "routers_loss": 0.26632770895957947, "skip_count": 2.0, "step": 442, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.4150170648464164, "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.75, "learning_rate": 0.0008860000000000001, "loss": 0.1874, "macro_f1": 0.2857142984867096, "num_tokens": 699954.0, "repeat_count": 1.0, "routers_loss": 0.3751397728919983, "skip_count": 3.0, "step": 444, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.425938566552901, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.28125, "learning_rate": 0.0008900000000000001, "loss": 0.2139, "macro_f1": 0.32098764181137085, "num_tokens": 703477.0, "repeat_count": 0.0, "routers_loss": 0.2166936844587326, "skip_count": 2.0, "step": 446, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.4368600682593855, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.0625, "learning_rate": 0.000894, "loss": 0.3078, "macro_f1": 0.3333333432674408, "num_tokens": 706342.0, "repeat_count": 0.0, "routers_loss": 0.004165076185017824, "skip_count": 0.0, "step": 448, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.4477815699658705, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.5625, "learning_rate": 0.000898, "loss": 0.3248, "macro_f1": 0.307692289352417, "num_tokens": 709048.0, "repeat_count": 0.0, "routers_loss": 0.11787679046392441, "skip_count": 1.0, "step": 450, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.458703071672355, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.875, "learning_rate": 0.000902, "loss": 0.2151, "macro_f1": 0.31446540355682373, "num_tokens": 712168.0, "repeat_count": 2.0, "routers_loss": 0.24694015085697174, "skip_count": 0.0, "step": 452, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.4696245733788396, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 9.0625, "learning_rate": 0.000906, "loss": 0.1899, "macro_f1": 0.5492662787437439, "num_tokens": 715867.0, "repeat_count": 0.0, "routers_loss": 0.14055466651916504, "skip_count": 2.0, "step": 454, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.480546075085324, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.84375, "learning_rate": 0.00091, "loss": 0.136, "macro_f1": 0.32098764181137085, "num_tokens": 718940.0, "repeat_count": 0.0, "routers_loss": 0.2996567487716675, "skip_count": 2.0, "step": 456, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.491467576791809, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 8.5625, "learning_rate": 0.0009140000000000001, "loss": 0.2439, "macro_f1": 0.5492662787437439, "num_tokens": 721407.0, "repeat_count": 0.0, "routers_loss": 0.032011453062295914, "skip_count": 2.0, "step": 458, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 2.5023890784982936, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 11.0, "learning_rate": 0.0009180000000000001, "loss": 0.2592, "macro_f1": 0.3144654333591461, "num_tokens": 726056.0, "repeat_count": 0.0, "routers_loss": 0.06647517532110214, "skip_count": 0.0, "step": 460, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.513310580204778, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.875, "learning_rate": 0.0009220000000000001, "loss": 0.1904, "macro_f1": 0.32098764181137085, "num_tokens": 729038.0, "repeat_count": 0.0, "routers_loss": 0.08919267356395721, "skip_count": 0.0, "step": 462, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 2.5242320819112627, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.46875, "learning_rate": 0.0009260000000000001, "loss": 0.1969, "macro_f1": 0.3006536066532135, "num_tokens": 732172.0, "repeat_count": 0.0, "routers_loss": 0.4903416037559509, "skip_count": 2.0, "step": 464, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 2.5351535836177472, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 5.5, "learning_rate": 0.00093, "loss": 0.1957, "macro_f1": 0.6666666865348816, "num_tokens": 735282.0, "repeat_count": 0.0, "routers_loss": 0.025489339604973793, "skip_count": 2.0, "step": 466, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.546075085324232, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.28125, "learning_rate": 0.000934, "loss": 0.2198, "macro_f1": 0.3333333432674408, "num_tokens": 739208.0, "repeat_count": 0.0, "routers_loss": 0.013121264986693859, "skip_count": 0.0, "step": 468, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.5569965870307167, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.15625, "learning_rate": 0.0009379999999999999, "loss": 0.3641, "macro_f1": 0.32098764181137085, "num_tokens": 741980.0, "repeat_count": 0.0, "routers_loss": 0.45740270614624023, "skip_count": 2.0, "step": 470, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.5679180887372013, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.90625, "learning_rate": 0.000942, "loss": 0.1668, "macro_f1": 0.31446540355682373, "num_tokens": 745551.0, "repeat_count": 0.0, "routers_loss": 0.1244814470410347, "skip_count": 2.0, "step": 472, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 2.5788395904436863, "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 12.25, "learning_rate": 0.000946, "loss": 0.2807, "macro_f1": 0.2857142984867096, "num_tokens": 748488.0, "repeat_count": 1.0, "routers_loss": 0.3303976058959961, "skip_count": 3.0, "step": 474, "text_loss": 0.0 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 2.589761092150171, "f1_execute": 0.9411764740943909, "f1_repeat": 0.4000000059604645, "f1_skip": 0.0, "grad_norm": 3.640625, "learning_rate": 0.00095, "loss": 0.1353, "macro_f1": 0.44705885648727417, "num_tokens": 752865.0, "repeat_count": 3.0, "routers_loss": 0.24396798014640808, "skip_count": 0.0, "step": 476, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 2.6006825938566553, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 5.59375, "learning_rate": 0.000954, "loss": 0.1584, "macro_f1": 0.4400000274181366, "num_tokens": 755653.0, "repeat_count": 0.0, "routers_loss": 0.09343712776899338, "skip_count": 3.0, "step": 478, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.61160409556314, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.4375, "learning_rate": 0.000958, "loss": 0.2014, "macro_f1": 0.3272727429866791, "num_tokens": 758567.0, "repeat_count": 0.0, "routers_loss": 0.03879999741911888, "skip_count": 1.0, "step": 480, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.6225255972696244, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.5625, "learning_rate": 0.000962, "loss": 0.2174, "macro_f1": 0.32098764181137085, "num_tokens": 762013.0, "repeat_count": 0.0, "routers_loss": 0.13902239501476288, "skip_count": 2.0, "step": 482, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.6334470989761094, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.125, "learning_rate": 0.000966, "loss": 0.2322, "macro_f1": 0.3272727429866791, "num_tokens": 764820.0, "repeat_count": 0.0, "routers_loss": 0.0281832292675972, "skip_count": 0.0, "step": 484, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 2.644368600682594, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 11.25, "learning_rate": 0.0009699999999999999, "loss": 0.178, "macro_f1": 0.29333335161209106, "num_tokens": 767962.0, "repeat_count": 0.0, "routers_loss": 0.3387240767478943, "skip_count": 2.0, "step": 486, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 2.6552901023890785, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.4375, "learning_rate": 0.000974, "loss": 0.1818, "macro_f1": 0.32098764181137085, "num_tokens": 771189.0, "repeat_count": 0.0, "routers_loss": 0.033774666488170624, "skip_count": 0.0, "step": 488, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.666211604095563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.875, "learning_rate": 0.000978, "loss": 0.2071, "macro_f1": 0.3333333432674408, "num_tokens": 774073.0, "repeat_count": 0.0, "routers_loss": 0.009604716673493385, "skip_count": 0.0, "step": 490, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.6771331058020476, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.46875, "learning_rate": 0.000982, "loss": 0.1853, "macro_f1": 0.3333333432674408, "num_tokens": 776722.0, "repeat_count": 0.0, "routers_loss": 0.0034638401120901108, "skip_count": 0.0, "step": 492, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.6880546075085325, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.6875, "learning_rate": 0.0009860000000000001, "loss": 0.2882, "macro_f1": 0.32098764181137085, "num_tokens": 780051.0, "repeat_count": 0.0, "routers_loss": 0.08520562946796417, "skip_count": 0.0, "step": 494, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.698976109215017, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.875, "learning_rate": 0.00099, "loss": 0.1995, "macro_f1": 0.3272727429866791, "num_tokens": 782813.0, "repeat_count": 0.0, "routers_loss": 0.16369783878326416, "skip_count": 1.0, "step": 496, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 2.7098976109215016, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.859375, "learning_rate": 0.000994, "loss": 0.1725, "macro_f1": 0.3006536066532135, "num_tokens": 785376.0, "repeat_count": 0.0, "routers_loss": 0.17243081331253052, "skip_count": 2.0, "step": 498, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 26.0, "epoch": 2.7208191126279866, "f1_execute": 0.8749999403953552, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 9.9375, "learning_rate": 0.000998, "loss": 0.1842, "macro_f1": 0.402777761220932, "num_tokens": 788030.0, "repeat_count": 2.0, "routers_loss": 0.15272235870361328, "skip_count": 4.0, "step": 500, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.731740614334471, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.4375, "learning_rate": 0.0009999999674012276, "loss": 0.1709, "macro_f1": 0.32098764181137085, "num_tokens": 791099.0, "repeat_count": 0.0, "routers_loss": 0.02299564890563488, "skip_count": 0.0, "step": 502, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 2.7426621160409557, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.96875, "learning_rate": 0.000999999706611075, "loss": 0.1858, "macro_f1": 0.3144654333591461, "num_tokens": 794155.0, "repeat_count": 0.0, "routers_loss": 0.0592501275241375, "skip_count": 0.0, "step": 504, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 2.75358361774744, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.71875, "learning_rate": 0.0009999991850309056, "loss": 0.1347, "macro_f1": 0.307692289352417, "num_tokens": 797457.0, "repeat_count": 0.0, "routers_loss": 0.07785549014806747, "skip_count": 1.0, "step": 506, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.7645051194539247, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 9.25, "learning_rate": 0.0009999984026609918, "loss": 0.1448, "macro_f1": 0.4803921580314636, "num_tokens": 800614.0, "repeat_count": 0.0, "routers_loss": 0.32612788677215576, "skip_count": 2.0, "step": 508, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.7754266211604097, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.84375, "learning_rate": 0.0009999973595017412, "loss": 0.2566, "macro_f1": 0.3272727429866791, "num_tokens": 804027.0, "repeat_count": 0.0, "routers_loss": 0.03253546729683876, "skip_count": 0.0, "step": 510, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 2.7863481228668943, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 4.6875, "learning_rate": 0.0009999960555536983, "loss": 0.1271, "macro_f1": 0.5359477400779724, "num_tokens": 807662.0, "repeat_count": 1.0, "routers_loss": 0.16023527085781097, "skip_count": 2.0, "step": 512, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.797269624573379, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.375, "learning_rate": 0.0009999944908175428, "loss": 0.1876, "macro_f1": 0.3272727429866791, "num_tokens": 810905.0, "repeat_count": 0.0, "routers_loss": 0.022885220125317574, "skip_count": 0.0, "step": 514, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.8081911262798633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.078125, "learning_rate": 0.0009999926652940912, "loss": 0.1309, "macro_f1": 0.3333333432674408, "num_tokens": 814110.0, "repeat_count": 0.0, "routers_loss": 0.007647325750440359, "skip_count": 0.0, "step": 516, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.819112627986348, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.375, "learning_rate": 0.0009999905789842955, "loss": 0.2302, "macro_f1": 0.32098767161369324, "num_tokens": 816905.0, "repeat_count": 1.0, "routers_loss": 0.0514276959002018, "skip_count": 0.0, "step": 518, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.830034129692833, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.875, "learning_rate": 0.0009999882318892442, "loss": 0.2078, "macro_f1": 0.31446540355682373, "num_tokens": 819821.0, "repeat_count": 2.0, "routers_loss": 0.3009680211544037, "skip_count": 0.0, "step": 520, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 2.8409556313993174, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.859375, "learning_rate": 0.000999985624010161, "loss": 0.1296, "macro_f1": 0.32098767161369324, "num_tokens": 822580.0, "repeat_count": 0.0, "routers_loss": 0.05273444578051567, "skip_count": 1.0, "step": 522, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.851877133105802, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.5625, "learning_rate": 0.0009999827553484064, "loss": 0.2293, "macro_f1": 0.3333333432674408, "num_tokens": 825874.0, "repeat_count": 0.0, "routers_loss": 0.008311637677252293, "skip_count": 0.0, "step": 524, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.862798634812287, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.921875, "learning_rate": 0.0009999796259054763, "loss": 0.1759, "macro_f1": 0.29333335161209106, "num_tokens": 829040.0, "repeat_count": 3.0, "routers_loss": 1.207849383354187, "skip_count": 2.0, "step": 526, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.8737201365187715, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.65625, "learning_rate": 0.0009999762356830036, "loss": 0.2089, "macro_f1": 0.3006536364555359, "num_tokens": 834261.0, "repeat_count": 2.0, "routers_loss": 0.5721967220306396, "skip_count": 3.0, "step": 528, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 2.884641638225256, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.6875, "learning_rate": 0.000999972584682756, "loss": 0.2308, "macro_f1": 0.29333335161209106, "num_tokens": 837501.0, "repeat_count": 0.0, "routers_loss": 0.09908123314380646, "skip_count": 2.0, "step": 530, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 2.8955631399317405, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.6875, "learning_rate": 0.0009999686729066381, "loss": 0.1818, "macro_f1": 0.32098764181137085, "num_tokens": 840390.0, "repeat_count": 0.0, "routers_loss": 0.04153004288673401, "skip_count": 0.0, "step": 532, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 26.0, "epoch": 2.906484641638225, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 5.09375, "learning_rate": 0.0009999645003566902, "loss": 0.1759, "macro_f1": 0.4400000274181366, "num_tokens": 843327.0, "repeat_count": 1.0, "routers_loss": 0.37754446268081665, "skip_count": 3.0, "step": 534, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 2.91740614334471, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.953125, "learning_rate": 0.0009999600670350882, "loss": 0.1873, "macro_f1": 0.4871794879436493, "num_tokens": 847028.0, "repeat_count": 0.0, "routers_loss": 0.03440186381340027, "skip_count": 2.0, "step": 536, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 2.9283276450511946, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 12.875, "learning_rate": 0.000999955372944145, "loss": 0.342, "macro_f1": 0.29333335161209106, "num_tokens": 850735.0, "repeat_count": 1.0, "routers_loss": 0.18292225897312164, "skip_count": 0.0, "step": 538, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.939249146757679, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.421875, "learning_rate": 0.0009999504180863087, "loss": 0.1714, "macro_f1": 0.32098764181137085, "num_tokens": 854731.0, "repeat_count": 1.0, "routers_loss": 0.31060779094696045, "skip_count": 1.0, "step": 540, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.9501706484641637, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.65625, "learning_rate": 0.0009999452024641636, "loss": 0.1744, "macro_f1": 0.3144654333591461, "num_tokens": 858249.0, "repeat_count": 1.0, "routers_loss": 0.09356094151735306, "skip_count": 2.0, "step": 542, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.961092150170648, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.4375, "learning_rate": 0.0009999397260804302, "loss": 0.1456, "macro_f1": 0.3333333432674408, "num_tokens": 860901.0, "repeat_count": 0.0, "routers_loss": 0.006649349816143513, "skip_count": 0.0, "step": 544, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 2.972013651877133, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.0, "learning_rate": 0.0009999339889379647, "loss": 0.191, "macro_f1": 0.3272727429866791, "num_tokens": 863756.0, "repeat_count": 0.0, "routers_loss": 0.024081196635961533, "skip_count": 0.0, "step": 546, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 2.9829351535836177, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.328125, "learning_rate": 0.0009999279910397597, "loss": 0.1806, "macro_f1": 0.4871794879436493, "num_tokens": 867242.0, "repeat_count": 0.0, "routers_loss": 0.06612888723611832, "skip_count": 2.0, "step": 548, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.9938566552901023, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.09375, "learning_rate": 0.000999921732388943, "loss": 0.1438, "macro_f1": 0.32098764181137085, "num_tokens": 870235.0, "repeat_count": 0.0, "routers_loss": 0.02564089559018612, "skip_count": 0.0, "step": 550, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 3.0, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 11.0, "learning_rate": 0.0009999152129887801, "loss": 0.1395, "macro_f1": 0.3006536066532135, "num_tokens": 872748.0, "repeat_count": 1.0, "routers_loss": 0.31180688738822937, "skip_count": 2.0, "step": 552, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.8333333134651184, "avg_layers": 25.0, "epoch": 3.0109215017064845, "f1_execute": 0.9523809552192688, "f1_repeat": 0.6666666865348816, "f1_skip": 0.9090909361839294, "grad_norm": 7.8125, "learning_rate": 0.0009999084328426704, "loss": 0.1243, "macro_f1": 0.8427128791809082, "num_tokens": 876257.0, "repeat_count": 1.0, "routers_loss": 0.06441941112279892, "skip_count": 6.0, "step": 554, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.021843003412969, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.4375, "learning_rate": 0.0009999013919541506, "loss": 0.2276, "macro_f1": 0.32098764181137085, "num_tokens": 879189.0, "repeat_count": 0.0, "routers_loss": 0.1297590732574463, "skip_count": 2.0, "step": 556, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 3.032764505119454, "f1_execute": 0.95652174949646, "f1_repeat": 0.6666666865348816, "f1_skip": 0.5714285373687744, "grad_norm": 2.953125, "learning_rate": 0.0009998940903268932, "loss": 0.1034, "macro_f1": 0.7315390110015869, "num_tokens": 882626.0, "repeat_count": 2.0, "routers_loss": 0.40159890055656433, "skip_count": 4.0, "step": 558, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.0436860068259386, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.96875, "learning_rate": 0.0009998865279647066, "loss": 0.1627, "macro_f1": 0.307692289352417, "num_tokens": 885572.0, "repeat_count": 0.0, "routers_loss": 0.05809749290347099, "skip_count": 3.0, "step": 560, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.054607508532423, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.8125, "learning_rate": 0.0009998787048715349, "loss": 0.1533, "macro_f1": 0.31446540355682373, "num_tokens": 889088.0, "repeat_count": 0.0, "routers_loss": 0.4470720589160919, "skip_count": 2.0, "step": 562, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.0655290102389077, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.078125, "learning_rate": 0.0009998706210514589, "loss": 0.167, "macro_f1": 0.3272727429866791, "num_tokens": 892449.0, "repeat_count": 0.0, "routers_loss": 0.017404144629836082, "skip_count": 0.0, "step": 564, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 3.0764505119453927, "f1_execute": 0.8749999403953552, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.5, "learning_rate": 0.0009998622765086946, "loss": 0.1492, "macro_f1": 0.2916666567325592, "num_tokens": 895586.0, "repeat_count": 1.0, "routers_loss": 0.3639675974845886, "skip_count": 1.0, "step": 566, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 23.0, "epoch": 3.087372013651877, "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, "f1_skip": 0.3333333134651184, "grad_norm": 9.0625, "learning_rate": 0.0009998536712475944, "loss": 0.2095, "macro_f1": 0.4104308485984802, "num_tokens": 898285.0, "repeat_count": 1.0, "routers_loss": 0.16401837766170502, "skip_count": 1.0, "step": 568, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 3.0982935153583617, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.921875, "learning_rate": 0.0009998448052726467, "loss": 0.1679, "macro_f1": 0.5427350401878357, "num_tokens": 901345.0, "repeat_count": 1.0, "routers_loss": 0.2740897238254547, "skip_count": 1.0, "step": 570, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.1092150170648463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.921875, "learning_rate": 0.000999835678588476, "loss": 0.1513, "macro_f1": 0.3333333432674408, "num_tokens": 904674.0, "repeat_count": 0.0, "routers_loss": 0.004289933945983648, "skip_count": 0.0, "step": 572, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 28.0, "epoch": 3.1201365187713312, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 5.125, "learning_rate": 0.0009998262911998423, "loss": 0.2076, "macro_f1": 0.47333335876464844, "num_tokens": 908392.0, "repeat_count": 1.0, "routers_loss": 0.6915572881698608, "skip_count": 3.0, "step": 574, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 3.131058020477816, "f1_execute": 0.9387754797935486, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 5.65625, "learning_rate": 0.000999816643111642, "loss": 0.166, "macro_f1": 0.47959184646606445, "num_tokens": 911574.0, "repeat_count": 3.0, "routers_loss": 0.27853959798812866, "skip_count": 1.0, "step": 576, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 3.1419795221843003, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.6875, "learning_rate": 0.0009998067343289074, "loss": 0.2197, "macro_f1": 0.3076923191547394, "num_tokens": 914726.0, "repeat_count": 1.0, "routers_loss": 0.39462774991989136, "skip_count": 1.0, "step": 578, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.152901023890785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.515625, "learning_rate": 0.0009997965648568066, "loss": 0.1345, "macro_f1": 0.3333333432674408, "num_tokens": 918249.0, "repeat_count": 0.0, "routers_loss": 0.0032140507828444242, "skip_count": 0.0, "step": 580, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.1638225255972694, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.1875, "learning_rate": 0.000999786134700644, "loss": 0.1132, "macro_f1": 0.3333333432674408, "num_tokens": 921025.0, "repeat_count": 0.0, "routers_loss": 0.0016512145521119237, "skip_count": 0.0, "step": 582, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 3.1747440273037544, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.59375, "learning_rate": 0.0009997754438658595, "loss": 0.0915, "macro_f1": 0.3006536066532135, "num_tokens": 924102.0, "repeat_count": 0.0, "routers_loss": 0.6956021785736084, "skip_count": 2.0, "step": 584, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 3.185665529010239, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 9.1875, "learning_rate": 0.0009997644923580293, "loss": 0.1437, "macro_f1": 0.5359477400779724, "num_tokens": 927662.0, "repeat_count": 1.0, "routers_loss": 0.32544562220573425, "skip_count": 2.0, "step": 586, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.1965870307167235, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.25, "learning_rate": 0.0009997532801828658, "loss": 0.1488, "macro_f1": 0.3333333432674408, "num_tokens": 930556.0, "repeat_count": 0.0, "routers_loss": 0.00869440846145153, "skip_count": 0.0, "step": 588, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.207508532423208, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.65625, "learning_rate": 0.0009997418073462167, "loss": 0.1584, "macro_f1": 0.32098764181137085, "num_tokens": 933435.0, "repeat_count": 0.0, "routers_loss": 0.08498232066631317, "skip_count": 2.0, "step": 590, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.218430034129693, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.015625, "learning_rate": 0.0009997300738540662, "loss": 0.1075, "macro_f1": 0.32098764181137085, "num_tokens": 936478.0, "repeat_count": 0.0, "routers_loss": 0.19423364102840424, "skip_count": 2.0, "step": 592, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 3.2293515358361775, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 6.03125, "learning_rate": 0.000999718079712534, "loss": 0.1615, "macro_f1": 0.5492662787437439, "num_tokens": 939400.0, "repeat_count": 0.0, "routers_loss": 0.02402239292860031, "skip_count": 1.0, "step": 594, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 3.240273037542662, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 4.875, "learning_rate": 0.0009997058249278763, "loss": 0.221, "macro_f1": 0.6666666865348816, "num_tokens": 943300.0, "repeat_count": 1.0, "routers_loss": 0.0028402789030224085, "skip_count": 0.0, "step": 596, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.2511945392491466, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.09375, "learning_rate": 0.0009996933095064847, "loss": 0.1423, "macro_f1": 0.3144654333591461, "num_tokens": 947399.0, "repeat_count": 1.0, "routers_loss": 0.2962486445903778, "skip_count": 2.0, "step": 598, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.2621160409556316, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.625, "learning_rate": 0.0009996805334548872, "loss": 0.1535, "macro_f1": 0.29333335161209106, "num_tokens": 950094.0, "repeat_count": 0.0, "routers_loss": 0.47425299882888794, "skip_count": 4.0, "step": 600, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.4000000059604645, "avg_layers": 24.0, "epoch": 3.273037542662116, "f1_execute": 0.8636363744735718, "f1_repeat": 0.0, "f1_skip": 0.444444477558136, "grad_norm": 4.71875, "learning_rate": 0.0009996674967797476, "loss": 0.1282, "macro_f1": 0.43602699041366577, "num_tokens": 953673.0, "repeat_count": 3.0, "routers_loss": 0.3788261115550995, "skip_count": 5.0, "step": 602, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.2839590443686006, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.5625, "learning_rate": 0.0009996541994878655, "loss": 0.1239, "macro_f1": 0.3272727429866791, "num_tokens": 956885.0, "repeat_count": 1.0, "routers_loss": 0.13212358951568604, "skip_count": 0.0, "step": 604, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 3.294880546075085, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 3.828125, "learning_rate": 0.0009996406415861763, "loss": 0.0874, "macro_f1": 0.6601307392120361, "num_tokens": 959794.0, "repeat_count": 0.0, "routers_loss": 0.0332571342587471, "skip_count": 2.0, "step": 606, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.3058020477815697, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.5625, "learning_rate": 0.0009996268230817518, "loss": 0.1068, "macro_f1": 0.3333333432674408, "num_tokens": 963516.0, "repeat_count": 0.0, "routers_loss": 0.007200752384960651, "skip_count": 0.0, "step": 608, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.3167235494880547, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.75, "learning_rate": 0.0009996127439817993, "loss": 0.1237, "macro_f1": 0.3272727429866791, "num_tokens": 966363.0, "repeat_count": 0.0, "routers_loss": 0.23764896392822266, "skip_count": 1.0, "step": 610, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.3276450511945392, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.484375, "learning_rate": 0.0009995984042936621, "loss": 0.1411, "macro_f1": 0.3333333432674408, "num_tokens": 969265.0, "repeat_count": 0.0, "routers_loss": 0.0006030416116118431, "skip_count": 0.0, "step": 612, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 3.3385665529010238, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 4.8125, "learning_rate": 0.0009995838040248197, "loss": 0.1516, "macro_f1": 0.5492662787437439, "num_tokens": 972024.0, "repeat_count": 0.0, "routers_loss": 0.029178157448768616, "skip_count": 1.0, "step": 614, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 24.0, "epoch": 3.3494880546075088, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 6.0625, "learning_rate": 0.0009995689431828872, "loss": 0.132, "macro_f1": 0.41777777671813965, "num_tokens": 974328.0, "repeat_count": 0.0, "routers_loss": 0.41580793261528015, "skip_count": 2.0, "step": 616, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.3604095563139933, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.625, "learning_rate": 0.000999553821775616, "loss": 0.1495, "macro_f1": 0.307692289352417, "num_tokens": 977628.0, "repeat_count": 0.0, "routers_loss": 0.26905494928359985, "skip_count": 3.0, "step": 618, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.371331058020478, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.375, "learning_rate": 0.0009995384398108927, "loss": 0.1372, "macro_f1": 0.3333333432674408, "num_tokens": 980458.0, "repeat_count": 0.0, "routers_loss": 0.007225328590720892, "skip_count": 0.0, "step": 620, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 3.3822525597269624, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.984375, "learning_rate": 0.0009995227972967404, "loss": 0.1104, "macro_f1": 0.6603773832321167, "num_tokens": 983776.0, "repeat_count": 1.0, "routers_loss": 0.09698990732431412, "skip_count": 1.0, "step": 622, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.393174061433447, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.40625, "learning_rate": 0.000999506894241318, "loss": 0.1211, "macro_f1": 0.32098764181137085, "num_tokens": 986625.0, "repeat_count": 0.0, "routers_loss": 0.028710627928376198, "skip_count": 0.0, "step": 624, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.404095563139932, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 5.53125, "learning_rate": 0.0009994907306529201, "loss": 0.186, "macro_f1": 0.5427350401878357, "num_tokens": 989896.0, "repeat_count": 1.0, "routers_loss": 0.18436689674854279, "skip_count": 2.0, "step": 626, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 3.4150170648464164, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 5.71875, "learning_rate": 0.0009994743065399776, "loss": 0.1819, "macro_f1": 0.6666666865348816, "num_tokens": 992963.0, "repeat_count": 0.0, "routers_loss": 0.011628196574747562, "skip_count": 2.0, "step": 628, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.425938566552901, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.5625, "learning_rate": 0.0009994576219110565, "loss": 0.2279, "macro_f1": 0.3272727429866791, "num_tokens": 995486.0, "repeat_count": 0.0, "routers_loss": 0.03694930672645569, "skip_count": 0.0, "step": 630, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 3.4368600682593855, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.71875, "learning_rate": 0.0009994406767748596, "loss": 0.2908, "macro_f1": 0.3076923191547394, "num_tokens": 998880.0, "repeat_count": 1.0, "routers_loss": 0.3335764706134796, "skip_count": 1.0, "step": 632, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 3.4477815699658705, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 5.3125, "learning_rate": 0.000999423471140225, "loss": 0.1652, "macro_f1": 0.4871794879436493, "num_tokens": 1001623.0, "repeat_count": 0.0, "routers_loss": 0.03843867778778076, "skip_count": 2.0, "step": 634, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.458703071672355, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.15625, "learning_rate": 0.0009994060050161268, "loss": 0.1534, "macro_f1": 0.307692289352417, "num_tokens": 1004900.0, "repeat_count": 2.0, "routers_loss": 0.26561209559440613, "skip_count": 1.0, "step": 636, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 28.0, "epoch": 3.4696245733788396, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 6.40625, "learning_rate": 0.0009993882784116752, "loss": 0.147, "macro_f1": 0.4803921580314636, "num_tokens": 1008732.0, "repeat_count": 0.0, "routers_loss": 0.3012487590312958, "skip_count": 3.0, "step": 638, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.480546075085324, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.15625, "learning_rate": 0.0009993702913361155, "loss": 0.1252, "macro_f1": 0.3333333432674408, "num_tokens": 1011699.0, "repeat_count": 0.0, "routers_loss": 0.012646762654185295, "skip_count": 0.0, "step": 640, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 3.491467576791809, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.90625, "learning_rate": 0.0009993520437988302, "loss": 0.1487, "macro_f1": 0.480392187833786, "num_tokens": 1014406.0, "repeat_count": 1.0, "routers_loss": 0.1068505123257637, "skip_count": 3.0, "step": 642, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 3.5023890784982936, "f1_execute": 0.8085106015205383, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.34375, "learning_rate": 0.000999333535809336, "loss": 0.1731, "macro_f1": 0.26950353384017944, "num_tokens": 1017801.0, "repeat_count": 2.0, "routers_loss": 2.2939841747283936, "skip_count": 5.0, "step": 644, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.513310580204778, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.8125, "learning_rate": 0.0009993147673772868, "loss": 0.1609, "macro_f1": 0.3272727429866791, "num_tokens": 1021185.0, "repeat_count": 0.0, "routers_loss": 0.02110578864812851, "skip_count": 0.0, "step": 646, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 3.5242320819112627, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 4.90625, "learning_rate": 0.000999295738512472, "loss": 0.124, "macro_f1": 0.4533333480358124, "num_tokens": 1025108.0, "repeat_count": 0.0, "routers_loss": 0.15021832287311554, "skip_count": 2.0, "step": 648, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.5351535836177472, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.0, "learning_rate": 0.0009992764492248163, "loss": 0.2309, "macro_f1": 0.3333333432674408, "num_tokens": 1028805.0, "repeat_count": 0.0, "routers_loss": 0.002900304039940238, "skip_count": 0.0, "step": 650, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 26.0, "epoch": 3.546075085324232, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 5.0, "learning_rate": 0.0009992568995243808, "loss": 0.1452, "macro_f1": 0.44705885648727417, "num_tokens": 1032069.0, "repeat_count": 0.0, "routers_loss": 0.2886044383049011, "skip_count": 3.0, "step": 652, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.5569965870307167, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.0625, "learning_rate": 0.0009992370894213623, "loss": 0.1319, "macro_f1": 0.3144654333591461, "num_tokens": 1035634.0, "repeat_count": 1.0, "routers_loss": 0.42971259355545044, "skip_count": 2.0, "step": 654, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 3.5679180887372013, "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, "grad_norm": 7.375, "learning_rate": 0.000999217018926093, "loss": 0.1152, "macro_f1": 0.7795917987823486, "num_tokens": 1039948.0, "repeat_count": 1.0, "routers_loss": 0.07567094266414642, "skip_count": 3.0, "step": 656, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.5788395904436863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.46875, "learning_rate": 0.0009991966880490417, "loss": 0.1425, "macro_f1": 0.3333333432674408, "num_tokens": 1043710.0, "repeat_count": 0.0, "routers_loss": 0.001569207408465445, "skip_count": 0.0, "step": 658, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.589761092150171, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.453125, "learning_rate": 0.0009991760968008124, "loss": 0.1177, "macro_f1": 0.3333333432674408, "num_tokens": 1047211.0, "repeat_count": 0.0, "routers_loss": 0.014489148743450642, "skip_count": 0.0, "step": 660, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.6006825938566553, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.34375, "learning_rate": 0.0009991552451921453, "loss": 0.104, "macro_f1": 0.32098767161369324, "num_tokens": 1050220.0, "repeat_count": 0.0, "routers_loss": 0.052834026515483856, "skip_count": 1.0, "step": 662, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 3.61160409556314, "f1_execute": 0.875, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 4.84375, "learning_rate": 0.0009991341332339157, "loss": 0.1706, "macro_f1": 0.625, "num_tokens": 1053982.0, "repeat_count": 1.0, "routers_loss": 0.2865705192089081, "skip_count": 3.0, "step": 664, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 3.6225255972696244, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.25, "learning_rate": 0.0009991127609371357, "loss": 0.1275, "macro_f1": 0.307692289352417, "num_tokens": 1056846.0, "repeat_count": 1.0, "routers_loss": 0.32878634333610535, "skip_count": 0.0, "step": 666, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 3.6334470989761094, "f1_execute": 0.9777777791023254, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 3.328125, "learning_rate": 0.0009990911283129524, "loss": 0.1348, "macro_f1": 0.8814815282821655, "num_tokens": 1059648.0, "repeat_count": 2.0, "routers_loss": 0.10558832436800003, "skip_count": 4.0, "step": 668, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 3.644368600682594, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.90625, "learning_rate": 0.0009990692353726489, "loss": 0.0572, "macro_f1": 0.6666666865348816, "num_tokens": 1062290.0, "repeat_count": 0.0, "routers_loss": 0.0071791489608585835, "skip_count": 2.0, "step": 670, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.6552901023890785, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.0625, "learning_rate": 0.0009990470821276442, "loss": 0.156, "macro_f1": 0.3272727429866791, "num_tokens": 1065212.0, "repeat_count": 0.0, "routers_loss": 0.028384100645780563, "skip_count": 0.0, "step": 672, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 3.666211604095563, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 7.4375, "learning_rate": 0.0009990246685894933, "loss": 0.1457, "macro_f1": 0.4871794879436493, "num_tokens": 1068029.0, "repeat_count": 0.0, "routers_loss": 0.03461477532982826, "skip_count": 2.0, "step": 674, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.6771331058020476, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.78125, "learning_rate": 0.0009990019947698863, "loss": 0.1055, "macro_f1": 0.3333333432674408, "num_tokens": 1071229.0, "repeat_count": 0.0, "routers_loss": 0.004003713373094797, "skip_count": 0.0, "step": 676, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 3.6880546075085325, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 2.015625, "learning_rate": 0.0009989790606806494, "loss": 0.1026, "macro_f1": 0.5934640765190125, "num_tokens": 1074046.0, "repeat_count": 0.0, "routers_loss": 0.03134514391422272, "skip_count": 3.0, "step": 678, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 3.698976109215017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.71875, "learning_rate": 0.0009989558663337447, "loss": 0.1402, "macro_f1": 0.6666666865348816, "num_tokens": 1076635.0, "repeat_count": 0.0, "routers_loss": 0.00439166184514761, "skip_count": 1.0, "step": 680, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 3.7098976109215016, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.09375, "learning_rate": 0.0009989324117412699, "loss": 0.1021, "macro_f1": 0.31446540355682373, "num_tokens": 1079958.0, "repeat_count": 0.0, "routers_loss": 0.12589046359062195, "skip_count": 2.0, "step": 682, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.7208191126279866, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.890625, "learning_rate": 0.0009989086969154587, "loss": 0.1762, "macro_f1": 0.3333333432674408, "num_tokens": 1082589.0, "repeat_count": 0.0, "routers_loss": 0.01050520222634077, "skip_count": 0.0, "step": 684, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.731740614334471, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.90625, "learning_rate": 0.0009988847218686796, "loss": 0.1527, "macro_f1": 0.3272727429866791, "num_tokens": 1085634.0, "repeat_count": 0.0, "routers_loss": 0.08884720504283905, "skip_count": 1.0, "step": 686, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 3.7426621160409557, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.5625, "learning_rate": 0.0009988604866134384, "loss": 0.196, "macro_f1": 0.29333335161209106, "num_tokens": 1088501.0, "repeat_count": 1.0, "routers_loss": 0.3627224862575531, "skip_count": 2.0, "step": 688, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.75358361774744, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.9375, "learning_rate": 0.0009988359911623748, "loss": 0.2456, "macro_f1": 0.3272727429866791, "num_tokens": 1091083.0, "repeat_count": 0.0, "routers_loss": 0.025369791314005852, "skip_count": 0.0, "step": 690, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.7645051194539247, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.46875, "learning_rate": 0.000998811235528266, "loss": 0.1186, "macro_f1": 0.3272727429866791, "num_tokens": 1095673.0, "repeat_count": 0.0, "routers_loss": 0.023373540490865707, "skip_count": 0.0, "step": 692, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.7754266211604097, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.03125, "learning_rate": 0.0009987862197240237, "loss": 0.1518, "macro_f1": 0.3272727429866791, "num_tokens": 1098519.0, "repeat_count": 0.0, "routers_loss": 0.014006087556481361, "skip_count": 0.0, "step": 694, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 3.7863481228668943, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.59375, "learning_rate": 0.0009987609437626954, "loss": 0.2149, "macro_f1": 0.31446540355682373, "num_tokens": 1101510.0, "repeat_count": 0.0, "routers_loss": 0.057559430599212646, "skip_count": 1.0, "step": 696, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.797269624573379, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.765625, "learning_rate": 0.0009987354076574648, "loss": 0.1507, "macro_f1": 0.3333333432674408, "num_tokens": 1104637.0, "repeat_count": 0.0, "routers_loss": 0.001837484072893858, "skip_count": 0.0, "step": 698, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8081911262798633, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.53125, "learning_rate": 0.0009987096114216511, "loss": 0.1046, "macro_f1": 0.3272727429866791, "num_tokens": 1107964.0, "repeat_count": 0.0, "routers_loss": 0.3758608400821686, "skip_count": 1.0, "step": 700, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 3.819112627986348, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 4.375, "learning_rate": 0.000998683555068709, "loss": 0.1269, "macro_f1": 0.5934640765190125, "num_tokens": 1111541.0, "repeat_count": 0.0, "routers_loss": 0.02019377611577511, "skip_count": 2.0, "step": 702, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.830034129692833, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.671875, "learning_rate": 0.000998657238612229, "loss": 0.1522, "macro_f1": 0.3272727429866791, "num_tokens": 1114819.0, "repeat_count": 0.0, "routers_loss": 0.019685756415128708, "skip_count": 0.0, "step": 704, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8409556313993174, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.484375, "learning_rate": 0.0009986306620659374, "loss": 0.1104, "macro_f1": 0.3333333432674408, "num_tokens": 1117888.0, "repeat_count": 0.0, "routers_loss": 0.0059326752088963985, "skip_count": 0.0, "step": 706, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 3.851877133105802, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.125, "learning_rate": 0.0009986038254436956, "loss": 0.1038, "macro_f1": 0.32098764181137085, "num_tokens": 1120946.0, "repeat_count": 0.0, "routers_loss": 0.022552471607923508, "skip_count": 0.0, "step": 708, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 3.862798634812287, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.625, "learning_rate": 0.0009985767287595015, "loss": 0.1433, "macro_f1": 0.4871794879436493, "num_tokens": 1124013.0, "repeat_count": 0.0, "routers_loss": 0.03914980590343475, "skip_count": 2.0, "step": 710, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 3.8737201365187715, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 5.0625, "learning_rate": 0.0009985493720274879, "loss": 0.1663, "macro_f1": 1.0, "num_tokens": 1127662.0, "repeat_count": 1.0, "routers_loss": 0.01359120849519968, "skip_count": 2.0, "step": 712, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 3.884641638225256, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.96875, "learning_rate": 0.0009985217552619236, "loss": 0.1134, "macro_f1": 0.3272727429866791, "num_tokens": 1130742.0, "repeat_count": 0.0, "routers_loss": 0.0699341893196106, "skip_count": 0.0, "step": 714, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8955631399317405, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.75, "learning_rate": 0.000998493878477213, "loss": 0.1643, "macro_f1": 0.3333333432674408, "num_tokens": 1133386.0, "repeat_count": 0.0, "routers_loss": 0.006396451499313116, "skip_count": 0.0, "step": 716, "text_loss": 0.0 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 3.906484641638225, "f1_execute": 0.8292683362960815, "f1_repeat": 0.3333333432674408, "f1_skip": 0.6666666865348816, "grad_norm": 4.75, "learning_rate": 0.0009984657416878962, "loss": 0.1396, "macro_f1": 0.6097561120986938, "num_tokens": 1136071.0, "repeat_count": 3.0, "routers_loss": 0.23587316274642944, "skip_count": 6.0, "step": 718, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.91740614334471, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.625, "learning_rate": 0.0009984373449086485, "loss": 0.1686, "macro_f1": 0.3076923191547394, "num_tokens": 1139061.0, "repeat_count": 0.0, "routers_loss": 0.23841485381126404, "skip_count": 2.0, "step": 720, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 3.9283276450511946, "f1_execute": 0.9200000166893005, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 2.8125, "learning_rate": 0.0009984086881542815, "loss": 0.1112, "macro_f1": 0.5288889408111572, "num_tokens": 1141926.0, "repeat_count": 2.0, "routers_loss": 0.37492331862449646, "skip_count": 3.0, "step": 722, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 3.939249146757679, "f1_execute": 0.9166666865348816, "f1_repeat": 0.6666666865348816, "f1_skip": 0.4000000059604645, "grad_norm": 4.375, "learning_rate": 0.0009983797714397415, "loss": 0.1395, "macro_f1": 0.6611111164093018, "num_tokens": 1145302.0, "repeat_count": 2.0, "routers_loss": 0.5061943531036377, "skip_count": 2.0, "step": 724, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.9501706484641637, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 11.5625, "learning_rate": 0.0009983505947801115, "loss": 0.327, "macro_f1": 0.3272727429866791, "num_tokens": 1148991.0, "repeat_count": 0.0, "routers_loss": 0.030050436034798622, "skip_count": 0.0, "step": 726, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.961092150170648, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 4.84375, "learning_rate": 0.0009983211581906088, "loss": 0.2311, "macro_f1": 0.5492662787437439, "num_tokens": 1151711.0, "repeat_count": 0.0, "routers_loss": 0.04163246229290962, "skip_count": 2.0, "step": 728, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.972013651877133, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.0625, "learning_rate": 0.0009982914616865875, "loss": 0.1956, "macro_f1": 0.3333333432674408, "num_tokens": 1155061.0, "repeat_count": 0.0, "routers_loss": 0.002654903568327427, "skip_count": 0.0, "step": 730, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.9829351535836177, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.46875, "learning_rate": 0.0009982615052835364, "loss": 0.1239, "macro_f1": 0.31446540355682373, "num_tokens": 1158043.0, "repeat_count": 0.0, "routers_loss": 0.18476539850234985, "skip_count": 2.0, "step": 732, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 3.9938566552901023, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.625, "learning_rate": 0.0009982312889970804, "loss": 0.211, "macro_f1": 0.31446540355682373, "num_tokens": 1161487.0, "repeat_count": 2.0, "routers_loss": 0.33558642864227295, "skip_count": 0.0, "step": 734, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.0, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.5625, "learning_rate": 0.0009982008128429794, "loss": 0.14, "macro_f1": 0.3272727429866791, "num_tokens": 1163664.0, "repeat_count": 0.0, "routers_loss": 0.010565636679530144, "skip_count": 0.0, "step": 736, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.010921501706485, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.75, "learning_rate": 0.0009981700768371296, "loss": 0.0823, "macro_f1": 0.3333333432674408, "num_tokens": 1166461.0, "repeat_count": 0.0, "routers_loss": 0.001561413868330419, "skip_count": 0.0, "step": 738, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 4.021843003412969, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 5.125, "learning_rate": 0.000998139080995562, "loss": 0.1766, "macro_f1": 0.6666666865348816, "num_tokens": 1170134.0, "repeat_count": 0.0, "routers_loss": 0.010665918700397015, "skip_count": 2.0, "step": 740, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 4.032764505119454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.578125, "learning_rate": 0.0009981078253344432, "loss": 0.1177, "macro_f1": 0.3333333432674408, "num_tokens": 1173075.0, "repeat_count": 0.0, "routers_loss": 0.047345057129859924, "skip_count": 1.0, "step": 742, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 4.043686006825938, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.1875, "learning_rate": 0.000998076309870076, "loss": 0.0517, "macro_f1": 0.6666666865348816, "num_tokens": 1176281.0, "repeat_count": 0.0, "routers_loss": 0.0033105311449617147, "skip_count": 1.0, "step": 744, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 4.054607508532423, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.0625, "learning_rate": 0.000998044534618898, "loss": 0.0864, "macro_f1": 0.32098764181137085, "num_tokens": 1179403.0, "repeat_count": 0.0, "routers_loss": 0.033084314316511154, "skip_count": 0.0, "step": 746, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.065529010238908, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.234375, "learning_rate": 0.0009980124995974827, "loss": 0.0925, "macro_f1": 0.3006536066532135, "num_tokens": 1182596.0, "repeat_count": 1.0, "routers_loss": 0.21827591955661774, "skip_count": 3.0, "step": 748, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 4.076450511945392, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.125, "learning_rate": 0.0009979802048225388, "loss": 0.1244, "macro_f1": 0.4871794879436493, "num_tokens": 1186303.0, "repeat_count": 0.0, "routers_loss": 0.18225915729999542, "skip_count": 3.0, "step": 750, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 4.087372013651877, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 3.984375, "learning_rate": 0.0009979476503109107, "loss": 0.0728, "macro_f1": 0.5492662787437439, "num_tokens": 1189299.0, "repeat_count": 1.0, "routers_loss": 0.03163563460111618, "skip_count": 0.0, "step": 752, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 24.0, "epoch": 4.098293515358362, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 6.34375, "learning_rate": 0.000997914836079578, "loss": 0.148, "macro_f1": 0.41777777671813965, "num_tokens": 1192694.0, "repeat_count": 0.0, "routers_loss": 0.28674715757369995, "skip_count": 2.0, "step": 754, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.109215017064846, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.34375, "learning_rate": 0.0009978817621456562, "loss": 0.0869, "macro_f1": 0.31446540355682373, "num_tokens": 1196319.0, "repeat_count": 0.0, "routers_loss": 0.05852695554494858, "skip_count": 1.0, "step": 756, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.120136518771331, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.6484375, "learning_rate": 0.000997848428526396, "loss": 0.0648, "macro_f1": 0.5492662787437439, "num_tokens": 1199844.0, "repeat_count": 0.0, "routers_loss": 0.06834150850772858, "skip_count": 2.0, "step": 758, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.131058020477815, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.375, "learning_rate": 0.0009978148352391835, "loss": 0.0801, "macro_f1": 0.3333333432674408, "num_tokens": 1202876.0, "repeat_count": 0.0, "routers_loss": 0.0058227707631886005, "skip_count": 0.0, "step": 760, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 4.1419795221843, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.296875, "learning_rate": 0.00099778098230154, "loss": 0.1094, "macro_f1": 0.4871794879436493, "num_tokens": 1206870.0, "repeat_count": 0.0, "routers_loss": 0.079805389046669, "skip_count": 3.0, "step": 762, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 4.152901023890785, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.296875, "learning_rate": 0.0009977468697311232, "loss": 0.0902, "macro_f1": 0.3076923191547394, "num_tokens": 1209825.0, "repeat_count": 0.0, "routers_loss": 0.21695999801158905, "skip_count": 2.0, "step": 764, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.163822525597269, "f1_execute": 0.8749999403953552, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 3.265625, "learning_rate": 0.0009977124975457249, "loss": 0.1244, "macro_f1": 0.5138888955116272, "num_tokens": 1213093.0, "repeat_count": 2.0, "routers_loss": 0.12744387984275818, "skip_count": 4.0, "step": 766, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 4.174744027303754, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.34375, "learning_rate": 0.0009976778657632733, "loss": 0.0783, "macro_f1": 0.5427350401878357, "num_tokens": 1216291.0, "repeat_count": 0.0, "routers_loss": 0.07573267817497253, "skip_count": 2.0, "step": 768, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.1856655290102385, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.296875, "learning_rate": 0.0009976429744018313, "loss": 0.0752, "macro_f1": 0.3333333432674408, "num_tokens": 1219537.0, "repeat_count": 0.0, "routers_loss": 0.0009250715957023203, "skip_count": 0.0, "step": 770, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.1965870307167235, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.234375, "learning_rate": 0.0009976078234795983, "loss": 0.1114, "macro_f1": 0.3333333432674408, "num_tokens": 1222736.0, "repeat_count": 0.0, "routers_loss": 0.00175693747587502, "skip_count": 0.0, "step": 772, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.207508532423208, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.390625, "learning_rate": 0.0009975724130149076, "loss": 0.0918, "macro_f1": 0.5492662787437439, "num_tokens": 1226120.0, "repeat_count": 0.0, "routers_loss": 0.027441009879112244, "skip_count": 2.0, "step": 774, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.2184300341296925, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.46875, "learning_rate": 0.0009975367430262287, "loss": 0.0992, "macro_f1": 0.3272727429866791, "num_tokens": 1228810.0, "repeat_count": 0.0, "routers_loss": 0.027025407180190086, "skip_count": 0.0, "step": 776, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.2293515358361775, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.453125, "learning_rate": 0.0009975008135321667, "loss": 0.0931, "macro_f1": 0.3333333432674408, "num_tokens": 1231669.0, "repeat_count": 0.0, "routers_loss": 0.00917113944888115, "skip_count": 0.0, "step": 778, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.2402730375426625, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.890625, "learning_rate": 0.0009974646245514615, "loss": 0.0505, "macro_f1": 0.3333333432674408, "num_tokens": 1234476.0, "repeat_count": 0.0, "routers_loss": 0.010482276789844036, "skip_count": 0.0, "step": 780, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 4.251194539249147, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 4.28125, "learning_rate": 0.0009974281761029886, "loss": 0.0675, "macro_f1": 0.6666666865348816, "num_tokens": 1237748.0, "repeat_count": 0.0, "routers_loss": 0.009005382657051086, "skip_count": 1.0, "step": 782, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 4.262116040955632, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 2.9375, "learning_rate": 0.0009973914682057587, "loss": 0.1734, "macro_f1": 0.4871794879436493, "num_tokens": 1240362.0, "repeat_count": 0.0, "routers_loss": 0.09049399197101593, "skip_count": 2.0, "step": 784, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.273037542662116, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.984375, "learning_rate": 0.0009973545008789182, "loss": 0.1156, "macro_f1": 0.3333333432674408, "num_tokens": 1244147.0, "repeat_count": 0.0, "routers_loss": 0.0037465172354131937, "skip_count": 0.0, "step": 786, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.283959044368601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.84375, "learning_rate": 0.000997317274141748, "loss": 0.1302, "macro_f1": 0.3333333432674408, "num_tokens": 1247058.0, "repeat_count": 0.0, "routers_loss": 0.002100529847666621, "skip_count": 0.0, "step": 788, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 24.0, "epoch": 4.294880546075086, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 3.03125, "learning_rate": 0.0009972797880136654, "loss": 0.0771, "macro_f1": 0.41777777671813965, "num_tokens": 1250331.0, "repeat_count": 0.0, "routers_loss": 0.08377297967672348, "skip_count": 2.0, "step": 790, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 4.30580204778157, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.421875, "learning_rate": 0.0009972420425142224, "loss": 0.0782, "macro_f1": 0.4871794879436493, "num_tokens": 1253848.0, "repeat_count": 0.0, "routers_loss": 0.06583717465400696, "skip_count": 2.0, "step": 792, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.316723549488055, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.28125, "learning_rate": 0.0009972040376631057, "loss": 0.1235, "macro_f1": 0.32098767161369324, "num_tokens": 1257122.0, "repeat_count": 0.0, "routers_loss": 0.12353084981441498, "skip_count": 1.0, "step": 794, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.327645051194539, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.171875, "learning_rate": 0.0009971657734801384, "loss": 0.0899, "macro_f1": 0.3333333432674408, "num_tokens": 1261136.0, "repeat_count": 0.0, "routers_loss": 0.004150724504143, "skip_count": 0.0, "step": 796, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.338566552901024, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.5625, "learning_rate": 0.0009971272499852784, "loss": 0.1815, "macro_f1": 0.3272727429866791, "num_tokens": 1264211.0, "repeat_count": 0.0, "routers_loss": 0.02800264209508896, "skip_count": 0.0, "step": 798, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 4.349488054607509, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.125, "learning_rate": 0.0009970884671986187, "loss": 0.1118, "macro_f1": 0.5492662787437439, "num_tokens": 1266964.0, "repeat_count": 0.0, "routers_loss": 0.05382822826504707, "skip_count": 1.0, "step": 800, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.360409556313993, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.1875, "learning_rate": 0.0009970494251403874, "loss": 0.1015, "macro_f1": 0.31446540355682373, "num_tokens": 1269856.0, "repeat_count": 0.0, "routers_loss": 0.20994320511817932, "skip_count": 2.0, "step": 802, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 4.371331058020478, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.296875, "learning_rate": 0.000997010123830948, "loss": 0.1095, "macro_f1": 0.31446540355682373, "num_tokens": 1272945.0, "repeat_count": 0.0, "routers_loss": 0.07841377705335617, "skip_count": 1.0, "step": 804, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 4.382252559726963, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 6.5625, "learning_rate": 0.0009969705632907999, "loss": 0.1242, "macro_f1": 0.6666666865348816, "num_tokens": 1276127.0, "repeat_count": 2.0, "routers_loss": 0.008330464363098145, "skip_count": 0.0, "step": 806, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.393174061433447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.4375, "learning_rate": 0.0009969307435405766, "loss": 0.1688, "macro_f1": 0.3333333432674408, "num_tokens": 1279056.0, "repeat_count": 0.0, "routers_loss": 0.004059277940541506, "skip_count": 0.0, "step": 808, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.404095563139932, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.3125, "learning_rate": 0.0009968906646010474, "loss": 0.1232, "macro_f1": 0.3333333432674408, "num_tokens": 1282092.0, "repeat_count": 0.0, "routers_loss": 0.005245010834187269, "skip_count": 0.0, "step": 810, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 4.415017064846416, "f1_execute": 0.9411765336990356, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 7.3125, "learning_rate": 0.0009968503264931167, "loss": 0.0964, "macro_f1": 0.6470588445663452, "num_tokens": 1285759.0, "repeat_count": 1.0, "routers_loss": 0.04135916382074356, "skip_count": 0.0, "step": 812, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 4.425938566552901, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.0, "learning_rate": 0.0009968097292378244, "loss": 0.1636, "macro_f1": 0.32098767161369324, "num_tokens": 1288141.0, "repeat_count": 0.0, "routers_loss": 0.11239507049322128, "skip_count": 1.0, "step": 814, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.436860068259386, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.71875, "learning_rate": 0.0009967688728563446, "loss": 0.1044, "macro_f1": 0.32098767161369324, "num_tokens": 1291293.0, "repeat_count": 1.0, "routers_loss": 0.3831826150417328, "skip_count": 0.0, "step": 816, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 4.44778156996587, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.875, "learning_rate": 0.0009967277573699875, "loss": 0.1445, "macro_f1": 0.32098764181137085, "num_tokens": 1293847.0, "repeat_count": 0.0, "routers_loss": 0.054437290877103806, "skip_count": 0.0, "step": 818, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.458703071672355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.859375, "learning_rate": 0.000996686382800198, "loss": 0.0712, "macro_f1": 0.3333333432674408, "num_tokens": 1296724.0, "repeat_count": 0.0, "routers_loss": 0.012091469950973988, "skip_count": 0.0, "step": 820, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 24.0, "epoch": 4.46962457337884, "f1_execute": 0.936170220375061, "f1_repeat": 0.0, "f1_skip": 0.75, "grad_norm": 4.4375, "learning_rate": 0.000996644749168557, "loss": 0.1332, "macro_f1": 0.5620567798614502, "num_tokens": 1299674.0, "repeat_count": 1.0, "routers_loss": 0.06590834259986877, "skip_count": 4.0, "step": 822, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 25.0, "epoch": 4.480546075085324, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 3.265625, "learning_rate": 0.0009966028564967785, "loss": 0.1285, "macro_f1": 0.4400000274181366, "num_tokens": 1302843.0, "repeat_count": 1.0, "routers_loss": 0.06902799010276794, "skip_count": 2.0, "step": 824, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 25.0, "epoch": 4.491467576791809, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 7.4375, "learning_rate": 0.0009965607048067137, "loss": 0.1249, "macro_f1": 0.44705885648727417, "num_tokens": 1305575.0, "repeat_count": 0.0, "routers_loss": 0.08320864289999008, "skip_count": 2.0, "step": 826, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 4.502389078498293, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.65625, "learning_rate": 0.0009965182941203481, "loss": 0.1834, "macro_f1": 0.32098767161369324, "num_tokens": 1308244.0, "repeat_count": 0.0, "routers_loss": 0.12352414429187775, "skip_count": 1.0, "step": 828, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.513310580204778, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.9375, "learning_rate": 0.0009964756244598021, "loss": 0.0915, "macro_f1": 0.3333333432674408, "num_tokens": 1311314.0, "repeat_count": 0.0, "routers_loss": 0.014358235523104668, "skip_count": 0.0, "step": 830, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.524232081911263, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.65625, "learning_rate": 0.0009964326958473316, "loss": 0.102, "macro_f1": 0.3272727429866791, "num_tokens": 1315495.0, "repeat_count": 0.0, "routers_loss": 0.008667540736496449, "skip_count": 0.0, "step": 832, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.535153583617747, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.484375, "learning_rate": 0.000996389508305327, "loss": 0.0822, "macro_f1": 0.3272727429866791, "num_tokens": 1319132.0, "repeat_count": 0.0, "routers_loss": 0.018217027187347412, "skip_count": 0.0, "step": 834, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 4.546075085324232, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 10.8125, "learning_rate": 0.000996346061856314, "loss": 0.2215, "macro_f1": 0.31446540355682373, "num_tokens": 1321294.0, "repeat_count": 0.0, "routers_loss": 0.1659325808286667, "skip_count": 1.0, "step": 836, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.556996587030717, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.625, "learning_rate": 0.0009963023565229536, "loss": 0.1108, "macro_f1": 0.3272727429866791, "num_tokens": 1324186.0, "repeat_count": 0.0, "routers_loss": 0.11435546725988388, "skip_count": 0.0, "step": 838, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.567918088737201, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.34375, "learning_rate": 0.0009962583923280419, "loss": 0.1153, "macro_f1": 0.3333333432674408, "num_tokens": 1327215.0, "repeat_count": 0.0, "routers_loss": 0.001215719268657267, "skip_count": 0.0, "step": 840, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.578839590443686, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.5625, "learning_rate": 0.0009962141692945092, "loss": 0.1181, "macro_f1": 0.3272727429866791, "num_tokens": 1330394.0, "repeat_count": 1.0, "routers_loss": 0.05636778846383095, "skip_count": 0.0, "step": 842, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 4.58976109215017, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 5.53125, "learning_rate": 0.0009961696874454219, "loss": 0.0985, "macro_f1": 0.5934640765190125, "num_tokens": 1333840.0, "repeat_count": 0.0, "routers_loss": 0.17423874139785767, "skip_count": 2.0, "step": 844, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 4.600682593856655, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.375, "learning_rate": 0.0009961249468039806, "loss": 0.1442, "macro_f1": 0.3272727429866791, "num_tokens": 1337481.0, "repeat_count": 0.0, "routers_loss": 0.08344361186027527, "skip_count": 0.0, "step": 846, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 4.611604095563139, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.1875, "learning_rate": 0.0009960799473935212, "loss": 0.1287, "macro_f1": 0.29333335161209106, "num_tokens": 1340525.0, "repeat_count": 1.0, "routers_loss": 0.10816935449838638, "skip_count": 2.0, "step": 848, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.622525597269624, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.703125, "learning_rate": 0.0009960346892375143, "loss": 0.1476, "macro_f1": 0.3272727429866791, "num_tokens": 1344963.0, "repeat_count": 0.0, "routers_loss": 0.02773604914546013, "skip_count": 0.0, "step": 850, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.633447098976109, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.25, "learning_rate": 0.000995989172359566, "loss": 0.074, "macro_f1": 0.3144654333591461, "num_tokens": 1347911.0, "repeat_count": 0.0, "routers_loss": 0.07946910709142685, "skip_count": 3.0, "step": 852, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.6443686006825935, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.5625, "learning_rate": 0.0009959433967834167, "loss": 0.0946, "macro_f1": 0.3272727429866791, "num_tokens": 1352093.0, "repeat_count": 0.0, "routers_loss": 0.20672957599163055, "skip_count": 1.0, "step": 854, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 4.6552901023890785, "f1_execute": 0.8780487775802612, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, "grad_norm": 3.109375, "learning_rate": 0.0009958973625329424, "loss": 0.1035, "macro_f1": 0.737127423286438, "num_tokens": 1355052.0, "repeat_count": 3.0, "routers_loss": 0.14273089170455933, "skip_count": 6.0, "step": 856, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 4.6662116040955635, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.15625, "learning_rate": 0.0009958510696321532, "loss": 0.1217, "macro_f1": 0.32098764181137085, "num_tokens": 1358739.0, "repeat_count": 0.0, "routers_loss": 0.03209677338600159, "skip_count": 0.0, "step": 858, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.6771331058020476, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.609375, "learning_rate": 0.000995804518105195, "loss": 0.1511, "macro_f1": 0.3272727429866791, "num_tokens": 1361816.0, "repeat_count": 0.0, "routers_loss": 0.016142090782523155, "skip_count": 0.0, "step": 860, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.6880546075085325, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.03125, "learning_rate": 0.0009957577079763478, "loss": 0.1588, "macro_f1": 0.3333333432674408, "num_tokens": 1365188.0, "repeat_count": 0.0, "routers_loss": 0.005357397720217705, "skip_count": 0.0, "step": 862, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.6989761092150175, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.53125, "learning_rate": 0.0009957106392700272, "loss": 0.0981, "macro_f1": 0.3333333432674408, "num_tokens": 1368207.0, "repeat_count": 0.0, "routers_loss": 0.005774896126240492, "skip_count": 0.0, "step": 864, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.709897610921502, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.859375, "learning_rate": 0.000995663312010783, "loss": 0.1432, "macro_f1": 0.3333333432674408, "num_tokens": 1370949.0, "repeat_count": 0.0, "routers_loss": 0.0034105523955076933, "skip_count": 0.0, "step": 866, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.720819112627987, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.6875, "learning_rate": 0.0009956157262233003, "loss": 0.1171, "macro_f1": 0.3272727429866791, "num_tokens": 1373855.0, "repeat_count": 0.0, "routers_loss": 0.00975721050053835, "skip_count": 0.0, "step": 868, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 4.731740614334471, "f1_execute": 0.8979592323303223, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 9.8125, "learning_rate": 0.000995567881932399, "loss": 0.1658, "macro_f1": 0.4326530694961548, "num_tokens": 1376396.0, "repeat_count": 1.0, "routers_loss": 0.3017057776451111, "skip_count": 3.0, "step": 870, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.742662116040956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.5625, "learning_rate": 0.0009955197791630336, "loss": 0.141, "macro_f1": 0.3333333432674408, "num_tokens": 1379027.0, "repeat_count": 0.0, "routers_loss": 0.008239896968007088, "skip_count": 0.0, "step": 872, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.753583617747441, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.53125, "learning_rate": 0.0009954714179402936, "loss": 0.1144, "macro_f1": 0.3333333432674408, "num_tokens": 1382288.0, "repeat_count": 0.0, "routers_loss": 0.010364998131990433, "skip_count": 0.0, "step": 874, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 4.764505119453925, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 6.53125, "learning_rate": 0.0009954227982894035, "loss": 0.1795, "macro_f1": 0.5492662787437439, "num_tokens": 1385672.0, "repeat_count": 0.0, "routers_loss": 0.15057335793972015, "skip_count": 1.0, "step": 876, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 4.77542662116041, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.90625, "learning_rate": 0.0009953739202357217, "loss": 0.1139, "macro_f1": 0.29333335161209106, "num_tokens": 1389206.0, "repeat_count": 1.0, "routers_loss": 0.42493173480033875, "skip_count": 3.0, "step": 878, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.786348122866894, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.46875, "learning_rate": 0.0009953247838047428, "loss": 0.1882, "macro_f1": 0.3333333432674408, "num_tokens": 1392492.0, "repeat_count": 0.0, "routers_loss": 0.005968689452856779, "skip_count": 0.0, "step": 880, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.797269624573379, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.046875, "learning_rate": 0.0009952753890220948, "loss": 0.1183, "macro_f1": 0.3272727429866791, "num_tokens": 1395478.0, "repeat_count": 0.0, "routers_loss": 0.14635904133319855, "skip_count": 1.0, "step": 882, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 4.808191126279864, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.34375, "learning_rate": 0.0009952257359135417, "loss": 0.1388, "macro_f1": 0.3006536066532135, "num_tokens": 1398518.0, "repeat_count": 0.0, "routers_loss": 0.1135154739022255, "skip_count": 2.0, "step": 884, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 4.819112627986348, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 4.65625, "learning_rate": 0.0009951758245049808, "loss": 0.179, "macro_f1": 0.5359477400779724, "num_tokens": 1401259.0, "repeat_count": 0.0, "routers_loss": 0.18914444744586945, "skip_count": 1.0, "step": 886, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 4.830034129692833, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.5, "learning_rate": 0.0009951256548224455, "loss": 0.0913, "macro_f1": 0.6603773832321167, "num_tokens": 1404149.0, "repeat_count": 1.0, "routers_loss": 0.04007445275783539, "skip_count": 1.0, "step": 888, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.840955631399318, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.15625, "learning_rate": 0.000995075226892103, "loss": 0.129, "macro_f1": 0.32098767161369324, "num_tokens": 1406960.0, "repeat_count": 0.0, "routers_loss": 0.4282263517379761, "skip_count": 1.0, "step": 890, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5714285969734192, "avg_layers": 27.0, "epoch": 4.851877133105802, "f1_execute": 0.8999999761581421, "f1_repeat": 0.800000011920929, "f1_skip": 0.7272727489471436, "grad_norm": 5.40625, "learning_rate": 0.0009950245407402557, "loss": 0.2196, "macro_f1": 0.8090909719467163, "num_tokens": 1409634.0, "repeat_count": 2.0, "routers_loss": 0.3470841348171234, "skip_count": 7.0, "step": 892, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 4.862798634812287, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.3125, "learning_rate": 0.0009949735963933404, "loss": 0.115, "macro_f1": 0.5487528443336487, "num_tokens": 1413390.0, "repeat_count": 1.0, "routers_loss": 0.05957069247961044, "skip_count": 2.0, "step": 894, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.873720136518771, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.53125, "learning_rate": 0.0009949223938779286, "loss": 0.0754, "macro_f1": 0.3333333432674408, "num_tokens": 1416605.0, "repeat_count": 0.0, "routers_loss": 0.002007940784096718, "skip_count": 0.0, "step": 896, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.884641638225256, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 8.1875, "learning_rate": 0.000994870933220727, "loss": 0.1282, "macro_f1": 0.4803921580314636, "num_tokens": 1420764.0, "repeat_count": 0.0, "routers_loss": 0.08513174206018448, "skip_count": 2.0, "step": 898, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.895563139931741, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.09375, "learning_rate": 0.0009948192144485757, "loss": 0.0972, "macro_f1": 0.32098767161369324, "num_tokens": 1424182.0, "repeat_count": 0.0, "routers_loss": 0.03853657469153404, "skip_count": 1.0, "step": 900, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 4.906484641638225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 5.03125, "learning_rate": 0.0009947672375884506, "loss": 0.1737, "macro_f1": 0.6666666865348816, "num_tokens": 1426986.0, "repeat_count": 0.0, "routers_loss": 0.008192243054509163, "skip_count": 1.0, "step": 902, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 4.91740614334471, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 4.875, "learning_rate": 0.0009947150026674621, "loss": 0.0577, "macro_f1": 0.9265305995941162, "num_tokens": 1429981.0, "repeat_count": 1.0, "routers_loss": 0.06954901665449142, "skip_count": 2.0, "step": 904, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.928327645051194, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.234375, "learning_rate": 0.0009946625097128543, "loss": 0.168, "macro_f1": 0.32098767161369324, "num_tokens": 1432902.0, "repeat_count": 0.0, "routers_loss": 0.0880909413099289, "skip_count": 1.0, "step": 906, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.939249146757679, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.9921875, "learning_rate": 0.000994609758752007, "loss": 0.1445, "macro_f1": 0.3272727429866791, "num_tokens": 1436788.0, "repeat_count": 1.0, "routers_loss": 0.5064544081687927, "skip_count": 0.0, "step": 908, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.950170648464164, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 4.0625, "learning_rate": 0.0009945567498124339, "loss": 0.1658, "macro_f1": 0.5492662787437439, "num_tokens": 1439507.0, "repeat_count": 0.0, "routers_loss": 0.019065011292696, "skip_count": 2.0, "step": 910, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.961092150170648, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.40625, "learning_rate": 0.0009945034829217832, "loss": 0.0968, "macro_f1": 0.3272727429866791, "num_tokens": 1442860.0, "repeat_count": 0.0, "routers_loss": 0.018776487559080124, "skip_count": 0.0, "step": 912, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 4.972013651877133, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.71875, "learning_rate": 0.0009944499581078382, "loss": 0.1252, "macro_f1": 0.3076923191547394, "num_tokens": 1446637.0, "repeat_count": 0.0, "routers_loss": 0.1531504988670349, "skip_count": 2.0, "step": 914, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 4.982935153583618, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.21875, "learning_rate": 0.000994396175398516, "loss": 0.0992, "macro_f1": 0.3144654333591461, "num_tokens": 1450238.0, "repeat_count": 0.0, "routers_loss": 0.1735955774784088, "skip_count": 0.0, "step": 916, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.993856655290102, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.6875, "learning_rate": 0.000994342134821869, "loss": 0.1523, "macro_f1": 0.3272727429866791, "num_tokens": 1453160.0, "repeat_count": 0.0, "routers_loss": 0.15269255638122559, "skip_count": 0.0, "step": 918, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 5.0, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 12.4375, "learning_rate": 0.0009942878364060837, "loss": 0.1131, "macro_f1": 0.31446540355682373, "num_tokens": 1454580.0, "repeat_count": 1.0, "routers_loss": 0.2639358341693878, "skip_count": 0.0, "step": 920, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 5.010921501706485, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 5.09375, "learning_rate": 0.0009942332801794807, "loss": 0.1702, "macro_f1": 0.6601307392120361, "num_tokens": 1457292.0, "repeat_count": 0.0, "routers_loss": 0.043732915073633194, "skip_count": 2.0, "step": 922, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 5.021843003412969, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.8125, "learning_rate": 0.000994178466170516, "loss": 0.1107, "macro_f1": 0.6538461446762085, "num_tokens": 1460434.0, "repeat_count": 1.0, "routers_loss": 0.36936479806900024, "skip_count": 1.0, "step": 924, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 5.032764505119454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.09375, "learning_rate": 0.0009941233944077788, "loss": 0.0547, "macro_f1": 0.6666666865348816, "num_tokens": 1463373.0, "repeat_count": 0.0, "routers_loss": 0.0019650806207209826, "skip_count": 1.0, "step": 926, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.043686006825938, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.125, "learning_rate": 0.000994068064919994, "loss": 0.0665, "macro_f1": 0.32098764181137085, "num_tokens": 1466927.0, "repeat_count": 1.0, "routers_loss": 0.06489580124616623, "skip_count": 1.0, "step": 928, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.054607508532423, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.203125, "learning_rate": 0.0009940124777360203, "loss": 0.0898, "macro_f1": 0.3272727429866791, "num_tokens": 1469834.0, "repeat_count": 0.0, "routers_loss": 0.013250669464468956, "skip_count": 0.0, "step": 930, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.065529010238908, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.421875, "learning_rate": 0.0009939566328848507, "loss": 0.0616, "macro_f1": 0.3272727429866791, "num_tokens": 1472714.0, "repeat_count": 0.0, "routers_loss": 0.03642500564455986, "skip_count": 1.0, "step": 932, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.076450511945392, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.015625, "learning_rate": 0.000993900530395613, "loss": 0.0672, "macro_f1": 0.5492662787437439, "num_tokens": 1476458.0, "repeat_count": 0.0, "routers_loss": 0.019950609654188156, "skip_count": 2.0, "step": 934, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.087372013651877, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 4.34375, "learning_rate": 0.0009938441702975688, "loss": 0.0714, "macro_f1": 0.5492662787437439, "num_tokens": 1479499.0, "repeat_count": 0.0, "routers_loss": 0.05769496411085129, "skip_count": 2.0, "step": 936, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 5.098293515358362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 3.09375, "learning_rate": 0.000993787552620115, "loss": 0.0647, "macro_f1": 0.6666666865348816, "num_tokens": 1482112.0, "repeat_count": 0.0, "routers_loss": 0.006518410053104162, "skip_count": 2.0, "step": 938, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.109215017064846, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.359375, "learning_rate": 0.0009937306773927816, "loss": 0.0569, "macro_f1": 0.5492662787437439, "num_tokens": 1485128.0, "repeat_count": 0.0, "routers_loss": 0.16481046378612518, "skip_count": 2.0, "step": 940, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.120136518771331, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.375, "learning_rate": 0.0009936735446452341, "loss": 0.0689, "macro_f1": 0.3333333432674408, "num_tokens": 1487854.0, "repeat_count": 0.0, "routers_loss": 0.00462290458381176, "skip_count": 0.0, "step": 942, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.131058020477815, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.0625, "learning_rate": 0.0009936161544072716, "loss": 0.0596, "macro_f1": 0.3333333432674408, "num_tokens": 1490795.0, "repeat_count": 0.0, "routers_loss": 0.0042699906043708324, "skip_count": 0.0, "step": 944, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.1419795221843, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.78125, "learning_rate": 0.0009935585067088275, "loss": 0.1091, "macro_f1": 0.5492662787437439, "num_tokens": 1494150.0, "repeat_count": 0.0, "routers_loss": 0.01713154837489128, "skip_count": 2.0, "step": 946, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.152901023890785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.640625, "learning_rate": 0.0009935006015799703, "loss": 0.0893, "macro_f1": 0.3333333432674408, "num_tokens": 1497517.0, "repeat_count": 0.0, "routers_loss": 0.014775852672755718, "skip_count": 0.0, "step": 948, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.163822525597269, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.828125, "learning_rate": 0.0009934424390509017, "loss": 0.1128, "macro_f1": 0.32098767161369324, "num_tokens": 1500944.0, "repeat_count": 0.0, "routers_loss": 0.08066675066947937, "skip_count": 1.0, "step": 950, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 27.0, "epoch": 5.174744027303754, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 1.421875, "learning_rate": 0.0009933840191519584, "loss": 0.0536, "macro_f1": 0.44705885648727417, "num_tokens": 1504267.0, "repeat_count": 0.0, "routers_loss": 0.10788286477327347, "skip_count": 4.0, "step": 952, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 28.0, "epoch": 5.1856655290102385, "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 2.1875, "learning_rate": 0.0009933253419136107, "loss": 0.0582, "macro_f1": 0.8200000524520874, "num_tokens": 1507688.0, "repeat_count": 1.0, "routers_loss": 0.088263139128685, "skip_count": 3.0, "step": 954, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.1965870307167235, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.90625, "learning_rate": 0.000993266407366464, "loss": 0.0989, "macro_f1": 0.3333333432674408, "num_tokens": 1510658.0, "repeat_count": 0.0, "routers_loss": 0.005081284325569868, "skip_count": 0.0, "step": 956, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.207508532423208, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.59375, "learning_rate": 0.000993207215541257, "loss": 0.0562, "macro_f1": 0.5492662787437439, "num_tokens": 1515152.0, "repeat_count": 0.0, "routers_loss": 0.025190535932779312, "skip_count": 2.0, "step": 958, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.2184300341296925, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 2.3125, "learning_rate": 0.000993147766468863, "loss": 0.0672, "macro_f1": 0.6666666865348816, "num_tokens": 1518790.0, "repeat_count": 1.0, "routers_loss": 0.007869229651987553, "skip_count": 0.0, "step": 960, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.2293515358361775, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.609375, "learning_rate": 0.0009930880601802898, "loss": 0.0658, "macro_f1": 0.5427350401878357, "num_tokens": 1522153.0, "repeat_count": 1.0, "routers_loss": 0.15375611186027527, "skip_count": 2.0, "step": 962, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.2402730375426625, "f1_execute": 0.8444444537162781, "f1_repeat": 0.800000011920929, "f1_skip": 0.0, "grad_norm": 5.15625, "learning_rate": 0.0009930280967066787, "loss": 0.1698, "macro_f1": 0.5481481552124023, "num_tokens": 1525054.0, "repeat_count": 3.0, "routers_loss": 0.3285106122493744, "skip_count": 4.0, "step": 964, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 5.251194539249147, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 2.71875, "learning_rate": 0.0009929678760793057, "loss": 0.0853, "macro_f1": 0.4871794879436493, "num_tokens": 1528654.0, "repeat_count": 0.0, "routers_loss": 0.06668563932180405, "skip_count": 2.0, "step": 966, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 5.262116040955632, "f1_execute": 0.9166666865348816, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.734375, "learning_rate": 0.0009929073983295804, "loss": 0.0927, "macro_f1": 0.5277777910232544, "num_tokens": 1531379.0, "repeat_count": 2.0, "routers_loss": 0.2843759059906006, "skip_count": 4.0, "step": 968, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 5.273037542662116, "f1_execute": 0.936170220375061, "f1_repeat": 0.0, "f1_skip": 0.5714285373687744, "grad_norm": 2.265625, "learning_rate": 0.0009928466634890473, "loss": 0.0759, "macro_f1": 0.502532958984375, "num_tokens": 1534519.0, "repeat_count": 1.0, "routers_loss": 0.061425577849149704, "skip_count": 4.0, "step": 970, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 5.283959044368601, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.859375, "learning_rate": 0.0009927856715893839, "loss": 0.1502, "macro_f1": 0.4871794879436493, "num_tokens": 1537641.0, "repeat_count": 0.0, "routers_loss": 0.12876227498054504, "skip_count": 2.0, "step": 972, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 5.294880546075086, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 2.703125, "learning_rate": 0.0009927244226624029, "loss": 0.0589, "macro_f1": 0.4803921580314636, "num_tokens": 1540885.0, "repeat_count": 1.0, "routers_loss": 0.24013344943523407, "skip_count": 2.0, "step": 974, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 5.30580204778157, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.578125, "learning_rate": 0.00099266291674005, "loss": 0.1553, "macro_f1": 0.6666666865348816, "num_tokens": 1545093.0, "repeat_count": 0.0, "routers_loss": 0.008588392287492752, "skip_count": 1.0, "step": 976, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.316723549488055, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.0625, "learning_rate": 0.000992601153854406, "loss": 0.0732, "macro_f1": 0.3272727429866791, "num_tokens": 1547669.0, "repeat_count": 0.0, "routers_loss": 0.1047874391078949, "skip_count": 1.0, "step": 978, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 5.327645051194539, "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.15625, "learning_rate": 0.000992539134037685, "loss": 0.1686, "macro_f1": 0.2857142984867096, "num_tokens": 1550684.0, "repeat_count": 1.0, "routers_loss": 0.3830685019493103, "skip_count": 2.0, "step": 980, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.338566552901024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.296875, "learning_rate": 0.0009924768573222353, "loss": 0.0979, "macro_f1": 0.3333333432674408, "num_tokens": 1553458.0, "repeat_count": 0.0, "routers_loss": 0.0034001434687525034, "skip_count": 0.0, "step": 982, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.349488054607509, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.515625, "learning_rate": 0.0009924143237405392, "loss": 0.0553, "macro_f1": 0.3333333432674408, "num_tokens": 1557067.0, "repeat_count": 0.0, "routers_loss": 0.0015051440568640828, "skip_count": 0.0, "step": 984, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 5.360409556313993, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.375, "learning_rate": 0.0009923515333252128, "loss": 0.0821, "macro_f1": 0.3006536066532135, "num_tokens": 1560210.0, "repeat_count": 0.0, "routers_loss": 0.38080108165740967, "skip_count": 2.0, "step": 986, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 5.371331058020478, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 5.34375, "learning_rate": 0.0009922884861090068, "loss": 0.104, "macro_f1": 0.5359477400779724, "num_tokens": 1563164.0, "repeat_count": 1.0, "routers_loss": 0.15402451157569885, "skip_count": 1.0, "step": 988, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.382252559726963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.6875, "learning_rate": 0.0009922251821248053, "loss": 0.0596, "macro_f1": 0.3333333432674408, "num_tokens": 1566178.0, "repeat_count": 0.0, "routers_loss": 0.0008378620259463787, "skip_count": 0.0, "step": 990, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.393174061433447, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.5, "learning_rate": 0.0009921616214056258, "loss": 0.0858, "macro_f1": 0.3272727429866791, "num_tokens": 1568705.0, "repeat_count": 0.0, "routers_loss": 0.1363816112279892, "skip_count": 1.0, "step": 992, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 5.404095563139932, "f1_execute": 0.9166666865348816, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.125, "learning_rate": 0.000992097803984621, "loss": 0.0683, "macro_f1": 0.5277777910232544, "num_tokens": 1571934.0, "repeat_count": 2.0, "routers_loss": 0.15122386813163757, "skip_count": 4.0, "step": 994, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.415017064846416, "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 2.328125, "learning_rate": 0.0009920337298950765, "loss": 0.12, "macro_f1": 0.6538461446762085, "num_tokens": 1574947.0, "repeat_count": 1.0, "routers_loss": 0.16266369819641113, "skip_count": 1.0, "step": 996, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.425938566552901, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3203125, "learning_rate": 0.0009919693991704123, "loss": 0.0627, "macro_f1": 0.3333333432674408, "num_tokens": 1577895.0, "repeat_count": 0.0, "routers_loss": 0.002958054654300213, "skip_count": 0.0, "step": 998, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.436860068259386, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.703125, "learning_rate": 0.0009919048118441818, "loss": 0.1173, "macro_f1": 0.5492662787437439, "num_tokens": 1581513.0, "repeat_count": 0.0, "routers_loss": 0.08616811782121658, "skip_count": 2.0, "step": 1000, "text_loss": 0.0 } ], "logging_steps": 2, "max_steps": 9200, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.7215681060599736e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }