{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.873720136518772, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.010921501706484642, "f1_execute": 0.5142857432365417, "f1_repeat": 0.2222222238779068, "f1_skip": 0.0, "grad_norm": 31.125, "learning_rate": 2e-06, "loss": 2.8198, "macro_f1": 0.24550265073776245, "num_tokens": 3507.0, "repeat_count": 1.0, "routers_loss": 1.076732873916626, "skip_count": 2.0, "step": 2, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 0.021843003412969283, "f1_execute": 0.6666666865348816, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 38.5, "learning_rate": 6e-06, "loss": 3.125, "macro_f1": 0.222222238779068, "num_tokens": 7330.0, "repeat_count": 0.0, "routers_loss": 4.3143134117126465, "skip_count": 0.0, "step": 4, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 0.032764505119453925, "f1_execute": 0.5999999642372131, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 33.75, "learning_rate": 1e-05, "loss": 3.0713, "macro_f1": 0.19999998807907104, "num_tokens": 11360.0, "repeat_count": 0.0, "routers_loss": 1.8818678855895996, "skip_count": 0.0, "step": 6, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 0.04368600682593857, "f1_execute": 0.5789473652839661, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 37.25, "learning_rate": 1.4e-05, "loss": 2.992, "macro_f1": 0.19298246502876282, "num_tokens": 14241.0, "repeat_count": 1.0, "routers_loss": 2.340613603591919, "skip_count": 1.0, "step": 8, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 0.05460750853242321, "f1_execute": 0.6666666865348816, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 34.5, "learning_rate": 1.8e-05, "loss": 3.0072, "macro_f1": 0.222222238779068, "num_tokens": 17520.0, "repeat_count": 0.0, "routers_loss": 1.7916433811187744, "skip_count": 0.0, "step": 10, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.06552901023890785, "f1_execute": 0.6315789818763733, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 41.25, "learning_rate": 2.2e-05, "loss": 3.2227, "macro_f1": 0.21052633225917816, "num_tokens": 20401.0, "repeat_count": 1.0, "routers_loss": 2.2361459732055664, "skip_count": 1.0, "step": 12, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 0.07645051194539249, "f1_execute": 0.5789473652839661, "f1_repeat": 0.0, "f1_skip": 0.20000000298023224, "grad_norm": 31.875, "learning_rate": 2.6e-05, "loss": 3.1809, "macro_f1": 0.2596491277217865, "num_tokens": 23722.0, "repeat_count": 1.0, "routers_loss": 2.6635637283325195, "skip_count": 2.0, "step": 14, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.08737201365187713, "f1_execute": 0.6341463327407837, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 40.25, "learning_rate": 3e-05, "loss": 3.2606, "macro_f1": 0.21138212084770203, "num_tokens": 26754.0, "repeat_count": 0.0, "routers_loss": 1.967104196548462, "skip_count": 0.0, "step": 16, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 25.0, "epoch": 0.09829351535836177, "f1_execute": 0.5405405163764954, "f1_repeat": 0.0, "f1_skip": 0.1666666567325592, "grad_norm": 39.5, "learning_rate": 3.4000000000000007e-05, "loss": 2.9096, "macro_f1": 0.23573574423789978, "num_tokens": 29878.0, "repeat_count": 0.0, "routers_loss": 0.6965824365615845, "skip_count": 2.0, "step": 18, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 0.10921501706484642, "f1_execute": 0.6666666865348816, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 40.75, "learning_rate": 3.8e-05, "loss": 3.2996, "macro_f1": 0.222222238779068, "num_tokens": 32410.0, "repeat_count": 0.0, "routers_loss": 7.038887977600098, "skip_count": 0.0, "step": 20, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 0.12013651877133105, "f1_execute": 0.5641025900840759, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 32.5, "learning_rate": 4.2000000000000004e-05, "loss": 2.7437, "macro_f1": 0.18803420662879944, "num_tokens": 35122.0, "repeat_count": 1.0, "routers_loss": 4.3931450843811035, "skip_count": 2.0, "step": 22, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.1310580204778157, "f1_execute": 0.6341463327407837, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 44.0, "learning_rate": 4.6e-05, "loss": 2.9583, "macro_f1": 0.21138212084770203, "num_tokens": 38647.0, "repeat_count": 0.0, "routers_loss": 5.246743202209473, "skip_count": 2.0, "step": 24, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 0.14197952218430035, "f1_execute": 0.6666666865348816, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 36.0, "learning_rate": 5e-05, "loss": 2.0258, "macro_f1": 0.222222238779068, "num_tokens": 41759.0, "repeat_count": 0.0, "routers_loss": 4.385664463043213, "skip_count": 0.0, "step": 26, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.15290102389078497, "f1_execute": 0.6666666865348816, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 22.75, "learning_rate": 5.4e-05, "loss": 1.8932, "macro_f1": 0.222222238779068, "num_tokens": 45255.0, "repeat_count": 1.0, "routers_loss": 2.442974090576172, "skip_count": 2.0, "step": 28, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.16382252559726962, "f1_execute": 0.7272726893424988, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 20.5, "learning_rate": 5.800000000000001e-05, "loss": 1.5961, "macro_f1": 0.24242423474788666, "num_tokens": 48765.0, "repeat_count": 0.0, "routers_loss": 1.319467306137085, "skip_count": 3.0, "step": 30, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.17474402730375427, "f1_execute": 0.782608687877655, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 21.875, "learning_rate": 6.2e-05, "loss": 1.7529, "macro_f1": 0.260869562625885, "num_tokens": 51973.0, "repeat_count": 0.0, "routers_loss": 1.2047386169433594, "skip_count": 2.0, "step": 32, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.18566552901023892, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 26.875, "learning_rate": 6.6e-05, "loss": 1.4983, "macro_f1": 0.29333335161209106, "num_tokens": 54972.0, "repeat_count": 0.0, "routers_loss": 0.8216792345046997, "skip_count": 0.0, "step": 34, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.19658703071672354, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 20.75, "learning_rate": 7.000000000000001e-05, "loss": 1.2751, "macro_f1": 0.3076923191547394, "num_tokens": 58134.0, "repeat_count": 0.0, "routers_loss": 0.6534898281097412, "skip_count": 0.0, "step": 36, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.2075085324232082, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 17.75, "learning_rate": 7.4e-05, "loss": 0.9561, "macro_f1": 0.29333335161209106, "num_tokens": 61291.0, "repeat_count": 0.0, "routers_loss": 0.6772168278694153, "skip_count": 2.0, "step": 38, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.21843003412969283, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 15.875, "learning_rate": 7.8e-05, "loss": 0.6809, "macro_f1": 0.307692289352417, "num_tokens": 64406.0, "repeat_count": 0.0, "routers_loss": 0.7885609865188599, "skip_count": 1.0, "step": 40, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.22935153583617748, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 17.0, "learning_rate": 8.2e-05, "loss": 0.587, "macro_f1": 0.3205128312110901, "num_tokens": 67402.0, "repeat_count": 1.0, "routers_loss": 0.31721553206443787, "skip_count": 0.0, "step": 42, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.2402730375426621, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 12.625, "learning_rate": 8.599999999999999e-05, "loss": 0.4996, "macro_f1": 0.32098764181137085, "num_tokens": 70935.0, "repeat_count": 0.0, "routers_loss": 0.13094936311244965, "skip_count": 0.0, "step": 44, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.25119453924914675, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 12.5625, "learning_rate": 8.999999999999999e-05, "loss": 0.4226, "macro_f1": 0.29333335161209106, "num_tokens": 73716.0, "repeat_count": 2.0, "routers_loss": 0.48597365617752075, "skip_count": 3.0, "step": 46, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.2621160409556314, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.1875, "learning_rate": 9.400000000000001e-05, "loss": 0.2499, "macro_f1": 0.31446540355682373, "num_tokens": 76662.0, "repeat_count": 0.0, "routers_loss": 0.7850716710090637, "skip_count": 1.0, "step": 48, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.27303754266211605, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.5625, "learning_rate": 9.800000000000001e-05, "loss": 0.3029, "macro_f1": 0.3144654333591461, "num_tokens": 80080.0, "repeat_count": 2.0, "routers_loss": 1.4728330373764038, "skip_count": 1.0, "step": 50, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.2839590443686007, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.0625, "learning_rate": 0.000102, "loss": 0.2549, "macro_f1": 0.32098764181137085, "num_tokens": 82942.0, "repeat_count": 0.0, "routers_loss": 0.16784702241420746, "skip_count": 2.0, "step": 52, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.29488054607508535, "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.5625, "learning_rate": 0.000106, "loss": 0.2782, "macro_f1": 0.2857142686843872, "num_tokens": 85928.0, "repeat_count": 1.0, "routers_loss": 0.25518977642059326, "skip_count": 4.0, "step": 54, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.30580204778156994, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.75, "learning_rate": 0.00011, "loss": 0.2309, "macro_f1": 0.307692289352417, "num_tokens": 88804.0, "repeat_count": 0.0, "routers_loss": 0.21613653004169464, "skip_count": 3.0, "step": 56, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.3167235494880546, "f1_execute": 0.8571429252624512, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.625, "learning_rate": 0.000114, "loss": 0.1319, "macro_f1": 0.285714328289032, "num_tokens": 91674.0, "repeat_count": 1.0, "routers_loss": 0.4971294403076172, "skip_count": 5.0, "step": 58, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.32764505119453924, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.625, "learning_rate": 0.000118, "loss": 0.1637, "macro_f1": 0.3333333432674408, "num_tokens": 94858.0, "repeat_count": 0.0, "routers_loss": 0.01838197372853756, "skip_count": 0.0, "step": 60, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.3385665529010239, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.859375, "learning_rate": 0.000122, "loss": 0.1888, "macro_f1": 0.31446540355682373, "num_tokens": 97538.0, "repeat_count": 1.0, "routers_loss": 0.5383598804473877, "skip_count": 1.0, "step": 62, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 31.0, "epoch": 0.34948805460750854, "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.5, "learning_rate": 0.000126, "loss": 0.2176, "macro_f1": 0.2857142686843872, "num_tokens": 101249.0, "repeat_count": 1.0, "routers_loss": 0.2093856781721115, "skip_count": 1.0, "step": 64, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.3604095563139932, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.625, "learning_rate": 0.00013000000000000002, "loss": 0.1568, "macro_f1": 0.3333333432674408, "num_tokens": 104398.0, "repeat_count": 0.0, "routers_loss": 0.015723152086138725, "skip_count": 0.0, "step": 66, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.37133105802047783, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.34375, "learning_rate": 0.000134, "loss": 0.2764, "macro_f1": 0.3333333432674408, "num_tokens": 107538.0, "repeat_count": 0.0, "routers_loss": 0.019146224483847618, "skip_count": 0.0, "step": 68, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.3822525597269625, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.25, "learning_rate": 0.00013800000000000002, "loss": 0.2035, "macro_f1": 0.3144654333591461, "num_tokens": 110689.0, "repeat_count": 3.0, "routers_loss": 0.6408394575119019, "skip_count": 0.0, "step": 70, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.3931740614334471, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.6875, "learning_rate": 0.00014199999999999998, "loss": 0.1986, "macro_f1": 0.32098764181137085, "num_tokens": 114205.0, "repeat_count": 0.0, "routers_loss": 0.04342689737677574, "skip_count": 0.0, "step": 72, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.4040955631399317, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.0625, "learning_rate": 0.000146, "loss": 0.1412, "macro_f1": 0.307692289352417, "num_tokens": 117140.0, "repeat_count": 0.0, "routers_loss": 0.12777170538902283, "skip_count": 1.0, "step": 74, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.4150170648464164, "f1_execute": 0.8571428060531616, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.5, "learning_rate": 0.00015, "loss": 0.1273, "macro_f1": 0.2857142686843872, "num_tokens": 120355.0, "repeat_count": 0.0, "routers_loss": 0.2570268511772156, "skip_count": 5.0, "step": 76, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.425938566552901, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.03125, "learning_rate": 0.000154, "loss": 0.1169, "macro_f1": 0.3333333432674408, "num_tokens": 123542.0, "repeat_count": 0.0, "routers_loss": 0.019178830087184906, "skip_count": 0.0, "step": 78, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.43686006825938567, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.5, "learning_rate": 0.000158, "loss": 0.1702, "macro_f1": 0.3006536066532135, "num_tokens": 126444.0, "repeat_count": 0.0, "routers_loss": 0.40678197145462036, "skip_count": 4.0, "step": 80, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.4477815699658703, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.40625, "learning_rate": 0.000162, "loss": 0.207, "macro_f1": 0.3333333432674408, "num_tokens": 129208.0, "repeat_count": 0.0, "routers_loss": 0.016020173206925392, "skip_count": 0.0, "step": 82, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.45870307167235497, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.0, "learning_rate": 0.00016600000000000002, "loss": 0.1469, "macro_f1": 0.3333333432674408, "num_tokens": 132692.0, "repeat_count": 0.0, "routers_loss": 0.015191584825515747, "skip_count": 0.0, "step": 84, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.4696245733788396, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.9375, "learning_rate": 0.00017, "loss": 0.1883, "macro_f1": 0.307692289352417, "num_tokens": 135433.0, "repeat_count": 1.0, "routers_loss": 0.29757800698280334, "skip_count": 2.0, "step": 86, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.1111111119389534, "avg_layers": 27.0, "epoch": 0.4805460750853242, "f1_execute": 0.7142857313156128, "f1_repeat": 0.0, "f1_skip": 0.1818181872367859, "grad_norm": 4.21875, "learning_rate": 0.000174, "loss": 0.2656, "macro_f1": 0.29870131611824036, "num_tokens": 139019.0, "repeat_count": 2.0, "routers_loss": 0.5406635403633118, "skip_count": 9.0, "step": 88, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.49146757679180886, "f1_execute": 0.8571429252624512, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.46875, "learning_rate": 0.000178, "loss": 0.2149, "macro_f1": 0.285714328289032, "num_tokens": 142156.0, "repeat_count": 3.0, "routers_loss": 0.9084331393241882, "skip_count": 3.0, "step": 90, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 25.0, "epoch": 0.5023890784982935, "f1_execute": 0.8979592323303223, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 6.15625, "learning_rate": 0.000182, "loss": 0.1461, "macro_f1": 0.4104308784008026, "num_tokens": 144866.0, "repeat_count": 1.0, "routers_loss": 0.298293799161911, "skip_count": 3.0, "step": 92, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5133105802047782, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.59375, "learning_rate": 0.000186, "loss": 0.1432, "macro_f1": 0.32098764181137085, "num_tokens": 148029.0, "repeat_count": 1.0, "routers_loss": 0.13971005380153656, "skip_count": 1.0, "step": 94, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5242320819112628, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.53125, "learning_rate": 0.00019, "loss": 0.1566, "macro_f1": 0.32098764181137085, "num_tokens": 151076.0, "repeat_count": 0.0, "routers_loss": 0.2203323394060135, "skip_count": 2.0, "step": 96, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5351535836177475, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 10.25, "learning_rate": 0.000194, "loss": 0.3221, "macro_f1": 0.32098764181137085, "num_tokens": 153825.0, "repeat_count": 0.0, "routers_loss": 0.22957128286361694, "skip_count": 2.0, "step": 98, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 0.5460750853242321, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.46875, "learning_rate": 0.00019800000000000002, "loss": 0.1445, "macro_f1": 0.3272727429866791, "num_tokens": 157200.0, "repeat_count": 0.0, "routers_loss": 0.0985352173447609, "skip_count": 0.0, "step": 100, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5569965870307167, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.125, "learning_rate": 0.000202, "loss": 0.2346, "macro_f1": 0.3144654333591461, "num_tokens": 161171.0, "repeat_count": 1.0, "routers_loss": 0.5728805065155029, "skip_count": 2.0, "step": 102, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 0.5679180887372014, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 4.65625, "learning_rate": 0.000206, "loss": 0.1532, "macro_f1": 0.4871794879436493, "num_tokens": 165319.0, "repeat_count": 0.0, "routers_loss": 0.08763546496629715, "skip_count": 2.0, "step": 104, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.578839590443686, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.375, "learning_rate": 0.00021, "loss": 0.1183, "macro_f1": 0.3272727429866791, "num_tokens": 168259.0, "repeat_count": 0.0, "routers_loss": 0.11700262129306793, "skip_count": 1.0, "step": 106, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.5897610921501707, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.1875, "learning_rate": 0.000214, "loss": 0.1856, "macro_f1": 0.3144654333591461, "num_tokens": 171640.0, "repeat_count": 1.0, "routers_loss": 0.2897156774997711, "skip_count": 2.0, "step": 108, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.6006825938566553, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.84375, "learning_rate": 0.000218, "loss": 0.1379, "macro_f1": 0.3006536066532135, "num_tokens": 174452.0, "repeat_count": 0.0, "routers_loss": 0.20764203369617462, "skip_count": 4.0, "step": 110, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6116040955631399, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.9375, "learning_rate": 0.000222, "loss": 0.14, "macro_f1": 0.32098764181137085, "num_tokens": 177034.0, "repeat_count": 0.0, "routers_loss": 0.07773401588201523, "skip_count": 0.0, "step": 112, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.6225255972696245, "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.375, "learning_rate": 0.00022600000000000002, "loss": 0.1327, "macro_f1": 0.2857142984867096, "num_tokens": 180310.0, "repeat_count": 2.0, "routers_loss": 0.3696478605270386, "skip_count": 2.0, "step": 114, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6334470989761092, "f1_execute": 0.8333333730697632, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.984375, "learning_rate": 0.00023, "loss": 0.155, "macro_f1": 0.2777777910232544, "num_tokens": 182835.0, "repeat_count": 3.0, "routers_loss": 0.5024136304855347, "skip_count": 5.0, "step": 116, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6443686006825938, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.71875, "learning_rate": 0.00023400000000000002, "loss": 0.1566, "macro_f1": 0.3333333432674408, "num_tokens": 186508.0, "repeat_count": 0.0, "routers_loss": 0.02631981112062931, "skip_count": 0.0, "step": 118, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.6552901023890785, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.03125, "learning_rate": 0.00023799999999999998, "loss": 0.1503, "macro_f1": 0.32098764181137085, "num_tokens": 190380.0, "repeat_count": 0.0, "routers_loss": 0.036612559109926224, "skip_count": 0.0, "step": 120, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.6662116040955631, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.28125, "learning_rate": 0.000242, "loss": 0.181, "macro_f1": 0.3076923191547394, "num_tokens": 193279.0, "repeat_count": 1.0, "routers_loss": 0.37753066420555115, "skip_count": 1.0, "step": 122, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.6771331058020478, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.75, "learning_rate": 0.000246, "loss": 0.1187, "macro_f1": 0.32098767161369324, "num_tokens": 196711.0, "repeat_count": 0.0, "routers_loss": 0.08419940620660782, "skip_count": 1.0, "step": 124, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 0.6880546075085324, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 4.0, "learning_rate": 0.00025, "loss": 0.1184, "macro_f1": 0.5492662787437439, "num_tokens": 199715.0, "repeat_count": 0.0, "routers_loss": 0.043020736426115036, "skip_count": 2.0, "step": 126, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.6989761092150171, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.9375, "learning_rate": 0.000254, "loss": 0.1421, "macro_f1": 0.32098767161369324, "num_tokens": 204217.0, "repeat_count": 0.0, "routers_loss": 0.0802314504981041, "skip_count": 1.0, "step": 128, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7098976109215017, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.0, "learning_rate": 0.00025800000000000004, "loss": 0.1719, "macro_f1": 0.32098764181137085, "num_tokens": 206777.0, "repeat_count": 1.0, "routers_loss": 0.09076520055532455, "skip_count": 1.0, "step": 130, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.7208191126279864, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.9375, "learning_rate": 0.000262, "loss": 0.1423, "macro_f1": 0.3272727429866791, "num_tokens": 210838.0, "repeat_count": 0.0, "routers_loss": 0.024340573698282242, "skip_count": 0.0, "step": 132, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.731740614334471, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.875, "learning_rate": 0.000266, "loss": 0.1, "macro_f1": 0.3333333432674408, "num_tokens": 213498.0, "repeat_count": 0.0, "routers_loss": 0.016322199255228043, "skip_count": 0.0, "step": 134, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7426621160409557, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.34375, "learning_rate": 0.00027, "loss": 0.1408, "macro_f1": 0.3272727429866791, "num_tokens": 216998.0, "repeat_count": 0.0, "routers_loss": 0.042806077748537064, "skip_count": 1.0, "step": 136, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7535836177474403, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.6875, "learning_rate": 0.00027400000000000005, "loss": 0.1012, "macro_f1": 0.32098764181137085, "num_tokens": 219952.0, "repeat_count": 0.0, "routers_loss": 0.12166574597358704, "skip_count": 2.0, "step": 138, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.764505119453925, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.59375, "learning_rate": 0.00027800000000000004, "loss": 0.1576, "macro_f1": 0.32098767161369324, "num_tokens": 223326.0, "repeat_count": 0.0, "routers_loss": 0.12389889359474182, "skip_count": 1.0, "step": 140, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.7754266211604095, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.46875, "learning_rate": 0.00028199999999999997, "loss": 0.1554, "macro_f1": 0.31446540355682373, "num_tokens": 226179.0, "repeat_count": 0.0, "routers_loss": 0.1315135806798935, "skip_count": 2.0, "step": 142, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7863481228668942, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.625, "learning_rate": 0.00028599999999999996, "loss": 0.1188, "macro_f1": 0.3272727429866791, "num_tokens": 228782.0, "repeat_count": 0.0, "routers_loss": 0.08095238357782364, "skip_count": 1.0, "step": 144, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.7972696245733788, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.5, "learning_rate": 0.00029, "loss": 0.1616, "macro_f1": 0.3076923191547394, "num_tokens": 231771.0, "repeat_count": 0.0, "routers_loss": 0.13997994363307953, "skip_count": 4.0, "step": 146, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8081911262798634, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.0, "learning_rate": 0.000294, "loss": 0.1868, "macro_f1": 0.3333333432674408, "num_tokens": 234517.0, "repeat_count": 0.0, "routers_loss": 0.03245344012975693, "skip_count": 0.0, "step": 148, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 0.8191126279863481, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.375, "learning_rate": 0.000298, "loss": 0.148, "macro_f1": 0.3006536066532135, "num_tokens": 237324.0, "repeat_count": 1.0, "routers_loss": 0.36887046694755554, "skip_count": 2.0, "step": 150, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8300341296928327, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.734375, "learning_rate": 0.000302, "loss": 0.1759, "macro_f1": 0.3272727429866791, "num_tokens": 240657.0, "repeat_count": 1.0, "routers_loss": 0.1363309770822525, "skip_count": 0.0, "step": 152, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8409556313993174, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.65625, "learning_rate": 0.000306, "loss": 0.2043, "macro_f1": 0.3333333432674408, "num_tokens": 243741.0, "repeat_count": 0.0, "routers_loss": 0.024881718680262566, "skip_count": 0.0, "step": 154, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 0.851877133105802, "f1_execute": 0.8979592323303223, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 4.5625, "learning_rate": 0.00031, "loss": 0.1777, "macro_f1": 0.4326530694961548, "num_tokens": 246879.0, "repeat_count": 1.0, "routers_loss": 0.25227662920951843, "skip_count": 3.0, "step": 156, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 0.8627986348122867, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 5.28125, "learning_rate": 0.000314, "loss": 0.1641, "macro_f1": 0.47333335876464844, "num_tokens": 249880.0, "repeat_count": 2.0, "routers_loss": 0.3088915944099426, "skip_count": 3.0, "step": 158, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 25.0, "epoch": 0.8737201365187713, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 6.59375, "learning_rate": 0.00031800000000000003, "loss": 0.1687, "macro_f1": 0.41777777671813965, "num_tokens": 252725.0, "repeat_count": 0.0, "routers_loss": 0.11272747814655304, "skip_count": 3.0, "step": 160, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 0.884641638225256, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.90625, "learning_rate": 0.000322, "loss": 0.1408, "macro_f1": 0.3144654333591461, "num_tokens": 255951.0, "repeat_count": 0.0, "routers_loss": 0.05064187943935394, "skip_count": 0.0, "step": 162, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.8955631399317406, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.65625, "learning_rate": 0.000326, "loss": 0.1509, "macro_f1": 0.3076923191547394, "num_tokens": 259469.0, "repeat_count": 0.0, "routers_loss": 0.21262036263942719, "skip_count": 2.0, "step": 164, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 26.0, "epoch": 0.9064846416382253, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 5.25, "learning_rate": 0.00033, "loss": 0.1578, "macro_f1": 0.4400000274181366, "num_tokens": 262272.0, "repeat_count": 1.0, "routers_loss": 0.1725386530160904, "skip_count": 3.0, "step": 166, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 0.9174061433447099, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.4375, "learning_rate": 0.00033400000000000004, "loss": 0.1471, "macro_f1": 0.3272727429866791, "num_tokens": 266415.0, "repeat_count": 0.0, "routers_loss": 0.02629087306559086, "skip_count": 0.0, "step": 168, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.9283276450511946, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.296875, "learning_rate": 0.00033800000000000003, "loss": 0.1185, "macro_f1": 0.32098767161369324, "num_tokens": 269700.0, "repeat_count": 0.0, "routers_loss": 0.05510875955224037, "skip_count": 1.0, "step": 170, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.9392491467576792, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.75, "learning_rate": 0.000342, "loss": 0.1637, "macro_f1": 0.3006536066532135, "num_tokens": 272587.0, "repeat_count": 1.0, "routers_loss": 0.27733829617500305, "skip_count": 3.0, "step": 172, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 0.9501706484641638, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.4375, "learning_rate": 0.000346, "loss": 0.2034, "macro_f1": 0.32098764181137085, "num_tokens": 277005.0, "repeat_count": 0.0, "routers_loss": 0.14457301795482635, "skip_count": 2.0, "step": 174, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 0.9610921501706484, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 9.125, "learning_rate": 0.00035, "loss": 0.154, "macro_f1": 0.4871794879436493, "num_tokens": 279607.0, "repeat_count": 0.0, "routers_loss": 0.07571296393871307, "skip_count": 2.0, "step": 176, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 0.9720136518771331, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.84375, "learning_rate": 0.000354, "loss": 0.1894, "macro_f1": 0.32098767161369324, "num_tokens": 282547.0, "repeat_count": 1.0, "routers_loss": 0.5549371838569641, "skip_count": 0.0, "step": 178, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 0.9829351535836177, "f1_execute": 0.9411765336990356, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 4.9375, "learning_rate": 0.000358, "loss": 0.1226, "macro_f1": 0.5359477400779724, "num_tokens": 286081.0, "repeat_count": 2.0, "routers_loss": 0.2509016990661621, "skip_count": 2.0, "step": 180, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 0.9938566552901024, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.9375, "learning_rate": 0.000362, "loss": 0.1795, "macro_f1": 0.3272727429866791, "num_tokens": 289224.0, "repeat_count": 0.0, "routers_loss": 0.017457736656069756, "skip_count": 0.0, "step": 182, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.0, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.390625, "learning_rate": 0.000366, "loss": 0.1471, "macro_f1": 0.3272727429866791, "num_tokens": 290916.0, "repeat_count": 0.0, "routers_loss": 0.05112108215689659, "skip_count": 0.0, "step": 184, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0109215017064845, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.6875, "learning_rate": 0.00037, "loss": 0.1459, "macro_f1": 0.3076923191547394, "num_tokens": 294182.0, "repeat_count": 3.0, "routers_loss": 0.5592358708381653, "skip_count": 1.0, "step": 186, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0218430034129693, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.890625, "learning_rate": 0.000374, "loss": 0.1446, "macro_f1": 0.3333333432674408, "num_tokens": 296702.0, "repeat_count": 0.0, "routers_loss": 0.006012737285345793, "skip_count": 0.0, "step": 188, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.0327645051194538, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.96875, "learning_rate": 0.000378, "loss": 0.1394, "macro_f1": 0.31446540355682373, "num_tokens": 300348.0, "repeat_count": 0.0, "routers_loss": 0.06094537675380707, "skip_count": 2.0, "step": 190, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0436860068259386, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.625, "learning_rate": 0.000382, "loss": 0.0995, "macro_f1": 0.3272727429866791, "num_tokens": 303466.0, "repeat_count": 0.0, "routers_loss": 0.08475696295499802, "skip_count": 1.0, "step": 192, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0546075085324231, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.1875, "learning_rate": 0.000386, "loss": 0.1749, "macro_f1": 0.3333333432674408, "num_tokens": 306160.0, "repeat_count": 0.0, "routers_loss": 0.010187637060880661, "skip_count": 0.0, "step": 194, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.065529010238908, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.28125, "learning_rate": 0.00039000000000000005, "loss": 0.1692, "macro_f1": 0.3076923191547394, "num_tokens": 309453.0, "repeat_count": 1.0, "routers_loss": 0.20142780244350433, "skip_count": 1.0, "step": 196, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.0764505119453924, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.90625, "learning_rate": 0.00039400000000000004, "loss": 0.1283, "macro_f1": 0.3333333432674408, "num_tokens": 312138.0, "repeat_count": 0.0, "routers_loss": 0.015577984042465687, "skip_count": 0.0, "step": 198, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 1.0873720136518772, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.46875, "learning_rate": 0.000398, "loss": 0.1061, "macro_f1": 0.4803921580314636, "num_tokens": 315833.0, "repeat_count": 0.0, "routers_loss": 0.1465342938899994, "skip_count": 2.0, "step": 200, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 1.0982935153583617, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 10.5625, "learning_rate": 0.000402, "loss": 0.1879, "macro_f1": 0.32098764181137085, "num_tokens": 318690.0, "repeat_count": 0.0, "routers_loss": 0.09964372962713242, "skip_count": 0.0, "step": 202, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 1.1092150170648465, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.25, "learning_rate": 0.00040600000000000006, "loss": 0.1226, "macro_f1": 0.32098764181137085, "num_tokens": 322294.0, "repeat_count": 0.0, "routers_loss": 0.030282732099294662, "skip_count": 0.0, "step": 204, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 1.120136518771331, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.125, "learning_rate": 0.00041, "loss": 0.1582, "macro_f1": 0.32098767161369324, "num_tokens": 325029.0, "repeat_count": 0.0, "routers_loss": 0.24788229167461395, "skip_count": 1.0, "step": 206, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 1.1310580204778158, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 5.9375, "learning_rate": 0.000414, "loss": 0.2048, "macro_f1": 0.4871794879436493, "num_tokens": 328178.0, "repeat_count": 0.0, "routers_loss": 0.031264692544937134, "skip_count": 1.0, "step": 208, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 25.0, "epoch": 1.1419795221843003, "f1_execute": 0.9166666269302368, "f1_repeat": 0.0, "f1_skip": 0.5714285373687744, "grad_norm": 6.8125, "learning_rate": 0.00041799999999999997, "loss": 0.1756, "macro_f1": 0.4960317313671112, "num_tokens": 331351.0, "repeat_count": 1.0, "routers_loss": 0.343823105096817, "skip_count": 4.0, "step": 210, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1529010238907849, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.84375, "learning_rate": 0.000422, "loss": 0.1246, "macro_f1": 0.3333333432674408, "num_tokens": 335297.0, "repeat_count": 0.0, "routers_loss": 0.014860679395496845, "skip_count": 0.0, "step": 212, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.1638225255972696, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.46875, "learning_rate": 0.000426, "loss": 0.1537, "macro_f1": 0.3006536066532135, "num_tokens": 338427.0, "repeat_count": 1.0, "routers_loss": 0.33231568336486816, "skip_count": 3.0, "step": 214, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.1747440273037544, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.59375, "learning_rate": 0.00043, "loss": 0.1546, "macro_f1": 0.3333333432674408, "num_tokens": 341158.0, "repeat_count": 0.0, "routers_loss": 0.007448212709277868, "skip_count": 0.0, "step": 216, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 1.185665529010239, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.28125, "learning_rate": 0.00043400000000000003, "loss": 0.1468, "macro_f1": 0.3272727429866791, "num_tokens": 344329.0, "repeat_count": 0.0, "routers_loss": 0.02311822399497032, "skip_count": 0.0, "step": 218, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.1965870307167235, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.15625, "learning_rate": 0.000438, "loss": 0.1307, "macro_f1": 0.32098767161369324, "num_tokens": 348948.0, "repeat_count": 0.0, "routers_loss": 0.02867077849805355, "skip_count": 1.0, "step": 220, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 1.2075085324232082, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 4.96875, "learning_rate": 0.000442, "loss": 0.2046, "macro_f1": 0.5492662787437439, "num_tokens": 351741.0, "repeat_count": 0.0, "routers_loss": 0.03160649910569191, "skip_count": 2.0, "step": 222, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.2184300341296928, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.5, "learning_rate": 0.000446, "loss": 0.2074, "macro_f1": 0.3272727429866791, "num_tokens": 354852.0, "repeat_count": 1.0, "routers_loss": 0.1611160784959793, "skip_count": 0.0, "step": 224, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.3333333432674408, "avg_layers": 29.0, "epoch": 1.2293515358361775, "f1_execute": 0.8695651888847351, "f1_repeat": 0.4000000059604645, "f1_skip": 0.4000000059604645, "grad_norm": 3.328125, "learning_rate": 0.00045000000000000004, "loss": 0.118, "macro_f1": 0.5565217733383179, "num_tokens": 357431.0, "repeat_count": 2.0, "routers_loss": 0.7632720470428467, "skip_count": 3.0, "step": 226, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.240273037542662, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.59375, "learning_rate": 0.00045400000000000003, "loss": 0.0965, "macro_f1": 0.32098767161369324, "num_tokens": 360192.0, "repeat_count": 0.0, "routers_loss": 0.08349918574094772, "skip_count": 1.0, "step": 228, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 1.2511945392491468, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 9.9375, "learning_rate": 0.000458, "loss": 0.1714, "macro_f1": 0.4871794879436493, "num_tokens": 363209.0, "repeat_count": 0.0, "routers_loss": 0.06626693904399872, "skip_count": 2.0, "step": 230, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.2621160409556313, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.25, "learning_rate": 0.000462, "loss": 0.1859, "macro_f1": 0.3272727429866791, "num_tokens": 368262.0, "repeat_count": 0.0, "routers_loss": 0.03743857145309448, "skip_count": 0.0, "step": 232, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 1.273037542662116, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.34375, "learning_rate": 0.00046600000000000005, "loss": 0.2281, "macro_f1": 0.31446540355682373, "num_tokens": 370737.0, "repeat_count": 1.0, "routers_loss": 0.12340149283409119, "skip_count": 0.0, "step": 234, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 1.2839590443686006, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.8125, "learning_rate": 0.00047, "loss": 0.1535, "macro_f1": 0.32098764181137085, "num_tokens": 373272.0, "repeat_count": 0.0, "routers_loss": 0.04501926526427269, "skip_count": 0.0, "step": 236, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 1.2948805460750854, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.625, "learning_rate": 0.000474, "loss": 0.1701, "macro_f1": 0.3076923191547394, "num_tokens": 376924.0, "repeat_count": 1.0, "routers_loss": 0.3543643057346344, "skip_count": 1.0, "step": 238, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 27.0, "epoch": 1.30580204778157, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 5.78125, "learning_rate": 0.00047799999999999996, "loss": 0.1553, "macro_f1": 0.4400000274181366, "num_tokens": 380034.0, "repeat_count": 1.0, "routers_loss": 0.1332877278327942, "skip_count": 4.0, "step": 240, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3167235494880547, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.125, "learning_rate": 0.000482, "loss": 0.0874, "macro_f1": 0.3333333432674408, "num_tokens": 382846.0, "repeat_count": 0.0, "routers_loss": 0.013933669775724411, "skip_count": 0.0, "step": 242, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3276450511945392, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.765625, "learning_rate": 0.000486, "loss": 0.1505, "macro_f1": 0.3272727429866791, "num_tokens": 385916.0, "repeat_count": 0.0, "routers_loss": 0.11566327512264252, "skip_count": 1.0, "step": 244, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.3385665529010238, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.9375, "learning_rate": 0.00049, "loss": 0.1634, "macro_f1": 0.3272727429866791, "num_tokens": 388768.0, "repeat_count": 0.0, "routers_loss": 0.015394577756524086, "skip_count": 0.0, "step": 246, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 1.3494880546075085, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.78125, "learning_rate": 0.000494, "loss": 0.1493, "macro_f1": 0.32098764181137085, "num_tokens": 391699.0, "repeat_count": 0.0, "routers_loss": 0.05529753863811493, "skip_count": 0.0, "step": 248, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.3604095563139933, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.75, "learning_rate": 0.000498, "loss": 0.2545, "macro_f1": 0.31446540355682373, "num_tokens": 395380.0, "repeat_count": 1.0, "routers_loss": 0.15498189628124237, "skip_count": 1.0, "step": 250, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.3713310580204778, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.8125, "learning_rate": 0.0005020000000000001, "loss": 0.1998, "macro_f1": 0.31446540355682373, "num_tokens": 398414.0, "repeat_count": 0.0, "routers_loss": 0.053408559411764145, "skip_count": 2.0, "step": 252, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 1.3822525597269624, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.4375, "learning_rate": 0.000506, "loss": 0.1761, "macro_f1": 0.31446540355682373, "num_tokens": 401690.0, "repeat_count": 0.0, "routers_loss": 0.15143637359142303, "skip_count": 1.0, "step": 254, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.3931740614334471, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.796875, "learning_rate": 0.00051, "loss": 0.1638, "macro_f1": 0.3272727429866791, "num_tokens": 404533.0, "repeat_count": 0.0, "routers_loss": 0.036931805312633514, "skip_count": 1.0, "step": 256, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 1.4040955631399317, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 7.21875, "learning_rate": 0.000514, "loss": 0.1765, "macro_f1": 0.5427350401878357, "num_tokens": 408175.0, "repeat_count": 1.0, "routers_loss": 0.16898785531520844, "skip_count": 2.0, "step": 258, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 1.4150170648464164, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.875, "learning_rate": 0.000518, "loss": 0.2172, "macro_f1": 0.4871794879436493, "num_tokens": 411160.0, "repeat_count": 0.0, "routers_loss": 0.05883602425456047, "skip_count": 1.0, "step": 260, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.425938566552901, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.453125, "learning_rate": 0.000522, "loss": 0.1121, "macro_f1": 0.31446540355682373, "num_tokens": 414391.0, "repeat_count": 0.0, "routers_loss": 0.14810606837272644, "skip_count": 2.0, "step": 262, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4368600682593857, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.46875, "learning_rate": 0.000526, "loss": 0.1772, "macro_f1": 0.3272727429866791, "num_tokens": 417763.0, "repeat_count": 1.0, "routers_loss": 0.20452100038528442, "skip_count": 0.0, "step": 264, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 26.0, "epoch": 1.4477815699658703, "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 3.5, "learning_rate": 0.0005300000000000001, "loss": 0.1446, "macro_f1": 0.4326530694961548, "num_tokens": 421881.0, "repeat_count": 2.0, "routers_loss": 0.32300108671188354, "skip_count": 3.0, "step": 266, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.20000000298023224, "avg_layers": 27.0, "epoch": 1.458703071672355, "f1_execute": 0.8260869383811951, "f1_repeat": 0.0, "f1_skip": 0.2857142984867096, "grad_norm": 3.96875, "learning_rate": 0.0005340000000000001, "loss": 0.1377, "macro_f1": 0.3706004321575165, "num_tokens": 424938.0, "repeat_count": 2.0, "routers_loss": 0.5530142784118652, "skip_count": 5.0, "step": 268, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.4696245733788396, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.5625, "learning_rate": 0.0005380000000000001, "loss": 0.1457, "macro_f1": 0.307692289352417, "num_tokens": 427555.0, "repeat_count": 0.0, "routers_loss": 0.10682675242424011, "skip_count": 3.0, "step": 270, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.480546075085324, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.90625, "learning_rate": 0.0005420000000000001, "loss": 0.174, "macro_f1": 0.3144654333591461, "num_tokens": 430168.0, "repeat_count": 1.0, "routers_loss": 0.9753395318984985, "skip_count": 2.0, "step": 272, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.4914675767918089, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.75, "learning_rate": 0.000546, "loss": 0.1441, "macro_f1": 0.3333333432674408, "num_tokens": 433358.0, "repeat_count": 0.0, "routers_loss": 0.021224403753876686, "skip_count": 0.0, "step": 274, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.5023890784982936, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.78125, "learning_rate": 0.00055, "loss": 0.1624, "macro_f1": 0.32098764181137085, "num_tokens": 436460.0, "repeat_count": 0.0, "routers_loss": 0.08185791224241257, "skip_count": 2.0, "step": 276, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 1.5133105802047782, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.40625, "learning_rate": 0.000554, "loss": 0.1677, "macro_f1": 0.3144654333591461, "num_tokens": 439531.0, "repeat_count": 0.0, "routers_loss": 0.037240445613861084, "skip_count": 0.0, "step": 278, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.5242320819112627, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.765625, "learning_rate": 0.000558, "loss": 0.2688, "macro_f1": 0.3006536066532135, "num_tokens": 442521.0, "repeat_count": 1.0, "routers_loss": 0.3406132459640503, "skip_count": 3.0, "step": 280, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.5351535836177475, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.5625, "learning_rate": 0.0005620000000000001, "loss": 0.0875, "macro_f1": 0.3333333432674408, "num_tokens": 444942.0, "repeat_count": 0.0, "routers_loss": 0.006758399773389101, "skip_count": 0.0, "step": 282, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.5460750853242322, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.90625, "learning_rate": 0.000566, "loss": 0.1597, "macro_f1": 0.3144654333591461, "num_tokens": 448193.0, "repeat_count": 0.0, "routers_loss": 0.06801790744066238, "skip_count": 0.0, "step": 284, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.20000000298023224, "avg_layers": 27.0, "epoch": 1.5569965870307167, "f1_execute": 0.8510637879371643, "f1_repeat": 0.0, "f1_skip": 0.3333333134651184, "grad_norm": 4.78125, "learning_rate": 0.00057, "loss": 0.2027, "macro_f1": 0.39479905366897583, "num_tokens": 451293.0, "repeat_count": 3.0, "routers_loss": 0.23832914233207703, "skip_count": 5.0, "step": 286, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.5679180887372013, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.40625, "learning_rate": 0.000574, "loss": 0.1361, "macro_f1": 0.3272727429866791, "num_tokens": 454069.0, "repeat_count": 1.0, "routers_loss": 0.14267782866954803, "skip_count": 0.0, "step": 288, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.578839590443686, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.65625, "learning_rate": 0.000578, "loss": 0.1921, "macro_f1": 0.31446540355682373, "num_tokens": 457308.0, "repeat_count": 0.0, "routers_loss": 0.3219856917858124, "skip_count": 2.0, "step": 290, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.5897610921501708, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.90625, "learning_rate": 0.0005819999999999999, "loss": 0.2214, "macro_f1": 0.31446540355682373, "num_tokens": 460138.0, "repeat_count": 1.0, "routers_loss": 0.4478992521762848, "skip_count": 1.0, "step": 292, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6006825938566553, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.9375, "learning_rate": 0.0005859999999999999, "loss": 0.2102, "macro_f1": 0.3333333432674408, "num_tokens": 464029.0, "repeat_count": 0.0, "routers_loss": 0.019972749054431915, "skip_count": 0.0, "step": 294, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6116040955631399, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.5, "learning_rate": 0.00059, "loss": 0.1164, "macro_f1": 0.3076923191547394, "num_tokens": 467500.0, "repeat_count": 1.0, "routers_loss": 0.14752870798110962, "skip_count": 3.0, "step": 296, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6225255972696244, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.75, "learning_rate": 0.000594, "loss": 0.1434, "macro_f1": 0.32098764181137085, "num_tokens": 470734.0, "repeat_count": 1.0, "routers_loss": 0.30419600009918213, "skip_count": 1.0, "step": 298, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 1.6334470989761092, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.375, "learning_rate": 0.000598, "loss": 0.2077, "macro_f1": 0.31446540355682373, "num_tokens": 474514.0, "repeat_count": 0.0, "routers_loss": 0.06921514868736267, "skip_count": 2.0, "step": 300, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.644368600682594, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.125, "learning_rate": 0.000602, "loss": 0.1566, "macro_f1": 0.3076923191547394, "num_tokens": 477393.0, "repeat_count": 0.0, "routers_loss": 0.2468976378440857, "skip_count": 2.0, "step": 302, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.6552901023890785, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.375, "learning_rate": 0.000606, "loss": 0.1649, "macro_f1": 0.3272727429866791, "num_tokens": 480381.0, "repeat_count": 0.0, "routers_loss": 0.020447812974452972, "skip_count": 0.0, "step": 304, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.666211604095563, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.5, "learning_rate": 0.00061, "loss": 0.1423, "macro_f1": 0.31446540355682373, "num_tokens": 483502.0, "repeat_count": 0.0, "routers_loss": 0.05023586004972458, "skip_count": 1.0, "step": 306, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 1.6771331058020478, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.0, "learning_rate": 0.000614, "loss": 0.2042, "macro_f1": 0.3144654333591461, "num_tokens": 488006.0, "repeat_count": 0.0, "routers_loss": 0.049936871975660324, "skip_count": 0.0, "step": 308, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.6880546075085325, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.46875, "learning_rate": 0.0006180000000000001, "loss": 0.2121, "macro_f1": 0.3272727429866791, "num_tokens": 491611.0, "repeat_count": 1.0, "routers_loss": 0.20010031759738922, "skip_count": 0.0, "step": 310, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.698976109215017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.09375, "learning_rate": 0.000622, "loss": 0.2415, "macro_f1": 0.3333333432674408, "num_tokens": 494903.0, "repeat_count": 0.0, "routers_loss": 0.01630268059670925, "skip_count": 0.0, "step": 312, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.7098976109215016, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.90625, "learning_rate": 0.000626, "loss": 0.2042, "macro_f1": 0.32098767161369324, "num_tokens": 497949.0, "repeat_count": 0.0, "routers_loss": 0.2674679160118103, "skip_count": 1.0, "step": 314, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 1.7208191126279864, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.5, "learning_rate": 0.00063, "loss": 0.1844, "macro_f1": 0.8823530077934265, "num_tokens": 501082.0, "repeat_count": 1.0, "routers_loss": 0.1621737778186798, "skip_count": 2.0, "step": 316, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.7317406143344711, "f1_execute": 0.8979592323303223, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 9.125, "learning_rate": 0.000634, "loss": 0.1708, "macro_f1": 0.5215420126914978, "num_tokens": 504131.0, "repeat_count": 2.0, "routers_loss": 0.6877225041389465, "skip_count": 2.0, "step": 318, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 1.7426621160409557, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.6875, "learning_rate": 0.000638, "loss": 0.1874, "macro_f1": 0.29333335161209106, "num_tokens": 507012.0, "repeat_count": 0.0, "routers_loss": 0.14521881937980652, "skip_count": 2.0, "step": 320, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 23.0, "epoch": 1.7535836177474402, "f1_execute": 0.8936170339584351, "f1_repeat": 0.0, "f1_skip": 0.444444477558136, "grad_norm": 4.46875, "learning_rate": 0.000642, "loss": 0.1489, "macro_f1": 0.44602054357528687, "num_tokens": 509950.0, "repeat_count": 0.0, "routers_loss": 0.15650968253612518, "skip_count": 4.0, "step": 322, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 1.764505119453925, "f1_execute": 0.8333333730697632, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.96875, "learning_rate": 0.000646, "loss": 0.163, "macro_f1": 0.2777777910232544, "num_tokens": 512900.0, "repeat_count": 2.0, "routers_loss": 0.3924711048603058, "skip_count": 3.0, "step": 324, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 1.7754266211604095, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.546875, "learning_rate": 0.0006500000000000001, "loss": 0.1452, "macro_f1": 0.5492662787437439, "num_tokens": 516233.0, "repeat_count": 0.0, "routers_loss": 0.038907092064619064, "skip_count": 2.0, "step": 326, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.7863481228668943, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.796875, "learning_rate": 0.0006540000000000001, "loss": 0.1641, "macro_f1": 0.3333333432674408, "num_tokens": 519636.0, "repeat_count": 0.0, "routers_loss": 0.0022514634765684605, "skip_count": 0.0, "step": 328, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 1.7972696245733788, "f1_execute": 0.9166666865348816, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 7.03125, "learning_rate": 0.0006580000000000001, "loss": 0.2761, "macro_f1": 0.4722222685813904, "num_tokens": 522992.0, "repeat_count": 2.0, "routers_loss": 0.4415050148963928, "skip_count": 2.0, "step": 330, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 1.8081911262798633, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.4375, "learning_rate": 0.000662, "loss": 0.1657, "macro_f1": 0.32098767161369324, "num_tokens": 526843.0, "repeat_count": 0.0, "routers_loss": 0.06788615882396698, "skip_count": 1.0, "step": 332, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 1.819112627986348, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 5.78125, "learning_rate": 0.000666, "loss": 0.1996, "macro_f1": 0.6603773832321167, "num_tokens": 530177.0, "repeat_count": 1.0, "routers_loss": 0.06985973566770554, "skip_count": 1.0, "step": 334, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.8300341296928329, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.46875, "learning_rate": 0.00067, "loss": 0.1877, "macro_f1": 0.307692289352417, "num_tokens": 533183.0, "repeat_count": 1.0, "routers_loss": 0.33230671286582947, "skip_count": 2.0, "step": 336, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8409556313993174, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.6875, "learning_rate": 0.000674, "loss": 0.1249, "macro_f1": 0.3076923191547394, "num_tokens": 536858.0, "repeat_count": 0.0, "routers_loss": 0.15104004740715027, "skip_count": 2.0, "step": 338, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.851877133105802, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.625, "learning_rate": 0.0006780000000000001, "loss": 0.1885, "macro_f1": 0.3272727429866791, "num_tokens": 540769.0, "repeat_count": 0.0, "routers_loss": 0.032123174518346786, "skip_count": 0.0, "step": 340, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8627986348122867, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.96875, "learning_rate": 0.0006820000000000001, "loss": 0.1809, "macro_f1": 0.3272727429866791, "num_tokens": 543783.0, "repeat_count": 0.0, "routers_loss": 0.05651572719216347, "skip_count": 1.0, "step": 342, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 1.8737201365187715, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.6875, "learning_rate": 0.0006860000000000001, "loss": 0.1804, "macro_f1": 0.3076923191547394, "num_tokens": 547125.0, "repeat_count": 0.0, "routers_loss": 0.13617995381355286, "skip_count": 2.0, "step": 344, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.884641638225256, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.65625, "learning_rate": 0.00069, "loss": 0.204, "macro_f1": 0.3272727429866791, "num_tokens": 550591.0, "repeat_count": 0.0, "routers_loss": 0.023369189351797104, "skip_count": 0.0, "step": 346, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.8955631399317405, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.625, "learning_rate": 0.000694, "loss": 0.2275, "macro_f1": 0.3272727429866791, "num_tokens": 553785.0, "repeat_count": 0.0, "routers_loss": 0.09765879064798355, "skip_count": 1.0, "step": 348, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.9064846416382253, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 10.5, "learning_rate": 0.0006979999999999999, "loss": 0.4191, "macro_f1": 0.3333333432674408, "num_tokens": 556135.0, "repeat_count": 0.0, "routers_loss": 0.011158714070916176, "skip_count": 0.0, "step": 350, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.91740614334471, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.53125, "learning_rate": 0.0007019999999999999, "loss": 0.1557, "macro_f1": 0.3272727429866791, "num_tokens": 558980.0, "repeat_count": 0.0, "routers_loss": 0.036593515425920486, "skip_count": 0.0, "step": 352, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 26.0, "epoch": 1.9283276450511946, "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 4.1875, "learning_rate": 0.0007059999999999999, "loss": 0.183, "macro_f1": 0.4104308485984802, "num_tokens": 562187.0, "repeat_count": 1.0, "routers_loss": 0.48064568638801575, "skip_count": 4.0, "step": 354, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 1.9392491467576791, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.0, "learning_rate": 0.00071, "loss": 0.1982, "macro_f1": 0.32098767161369324, "num_tokens": 565278.0, "repeat_count": 0.0, "routers_loss": 0.13826458156108856, "skip_count": 1.0, "step": 356, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 1.9501706484641637, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.90625, "learning_rate": 0.000714, "loss": 0.2709, "macro_f1": 0.3333333432674408, "num_tokens": 567869.0, "repeat_count": 0.0, "routers_loss": 0.01589345932006836, "skip_count": 0.0, "step": 358, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 1.9610921501706484, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.09375, "learning_rate": 0.000718, "loss": 0.1902, "macro_f1": 0.3272727429866791, "num_tokens": 571069.0, "repeat_count": 0.0, "routers_loss": 0.029062755405902863, "skip_count": 0.0, "step": 360, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 1.9720136518771332, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.125, "learning_rate": 0.000722, "loss": 0.2125, "macro_f1": 0.3076923191547394, "num_tokens": 573838.0, "repeat_count": 1.0, "routers_loss": 0.3241157531738281, "skip_count": 1.0, "step": 362, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 1.9829351535836177, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.46875, "learning_rate": 0.000726, "loss": 0.2176, "macro_f1": 0.3272727429866791, "num_tokens": 576554.0, "repeat_count": 0.0, "routers_loss": 0.03469887003302574, "skip_count": 0.0, "step": 364, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 1.9938566552901023, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 7.34375, "learning_rate": 0.00073, "loss": 0.182, "macro_f1": 0.4803921580314636, "num_tokens": 579653.0, "repeat_count": 1.0, "routers_loss": 0.11800751090049744, "skip_count": 1.0, "step": 366, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 15.125, "learning_rate": 0.000734, "loss": 0.3307, "macro_f1": 0.3333333432674408, "num_tokens": 581832.0, "repeat_count": 0.0, "routers_loss": 0.014465595595538616, "skip_count": 0.0, "step": 368, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 2.0109215017064845, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.375, "learning_rate": 0.000738, "loss": 0.1482, "macro_f1": 0.3272727429866791, "num_tokens": 585207.0, "repeat_count": 0.0, "routers_loss": 0.030198052525520325, "skip_count": 0.0, "step": 370, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.021843003412969, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.78125, "learning_rate": 0.000742, "loss": 0.0906, "macro_f1": 0.32098767161369324, "num_tokens": 588893.0, "repeat_count": 0.0, "routers_loss": 0.04226446524262428, "skip_count": 1.0, "step": 372, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 2.032764505119454, "f1_execute": 0.9777777791023254, "f1_repeat": 0.800000011920929, "f1_skip": 1.0, "grad_norm": 8.0625, "learning_rate": 0.000746, "loss": 0.2092, "macro_f1": 0.9259259104728699, "num_tokens": 592246.0, "repeat_count": 3.0, "routers_loss": 0.05995782092213631, "skip_count": 3.0, "step": 374, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 2.0436860068259386, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.03125, "learning_rate": 0.00075, "loss": 0.1724, "macro_f1": 0.3006536066532135, "num_tokens": 594777.0, "repeat_count": 0.0, "routers_loss": 0.14366891980171204, "skip_count": 3.0, "step": 376, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.054607508532423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.90625, "learning_rate": 0.000754, "loss": 0.0803, "macro_f1": 0.3333333432674408, "num_tokens": 597931.0, "repeat_count": 0.0, "routers_loss": 0.0027963866014033556, "skip_count": 0.0, "step": 378, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 29.0, "epoch": 2.0655290102389077, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 6.28125, "learning_rate": 0.000758, "loss": 0.2873, "macro_f1": 0.5359477400779724, "num_tokens": 601227.0, "repeat_count": 0.0, "routers_loss": 0.15012779831886292, "skip_count": 2.0, "step": 380, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 2.0764505119453927, "f1_execute": 0.9615384340286255, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 4.96875, "learning_rate": 0.000762, "loss": 0.1602, "macro_f1": 0.5427350401878357, "num_tokens": 604297.0, "repeat_count": 2.0, "routers_loss": 0.0708698183298111, "skip_count": 1.0, "step": 382, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.20000000298023224, "avg_layers": 28.0, "epoch": 2.087372013651877, "f1_execute": 0.8510638475418091, "f1_repeat": 0.0, "f1_skip": 0.3333333134651184, "grad_norm": 8.25, "learning_rate": 0.0007660000000000001, "loss": 0.1786, "macro_f1": 0.3947990834712982, "num_tokens": 607137.0, "repeat_count": 2.0, "routers_loss": 0.46035754680633545, "skip_count": 5.0, "step": 384, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 2.0982935153583617, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 9.8125, "learning_rate": 0.0007700000000000001, "loss": 0.1415, "macro_f1": 0.4871794879436493, "num_tokens": 610067.0, "repeat_count": 0.0, "routers_loss": 0.04594701901078224, "skip_count": 2.0, "step": 386, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 26.0, "epoch": 2.1092150170648463, "f1_execute": 0.9387754797935486, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 5.59375, "learning_rate": 0.0007740000000000001, "loss": 0.1453, "macro_f1": 0.42403626441955566, "num_tokens": 613020.0, "repeat_count": 1.0, "routers_loss": 0.21872307360172272, "skip_count": 4.0, "step": 388, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.1201365187713312, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.71875, "learning_rate": 0.000778, "loss": 0.2459, "macro_f1": 0.3006536066532135, "num_tokens": 615777.0, "repeat_count": 0.0, "routers_loss": 0.17068128287792206, "skip_count": 3.0, "step": 390, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.131058020477816, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.40625, "learning_rate": 0.000782, "loss": 0.1734, "macro_f1": 0.5492662787437439, "num_tokens": 618883.0, "repeat_count": 0.0, "routers_loss": 0.06883871555328369, "skip_count": 2.0, "step": 392, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 2.1419795221843003, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 4.4375, "learning_rate": 0.000786, "loss": 0.1822, "macro_f1": 0.4871794879436493, "num_tokens": 621785.0, "repeat_count": 0.0, "routers_loss": 0.021629702299833298, "skip_count": 2.0, "step": 394, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 2.152901023890785, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 10.4375, "learning_rate": 0.00079, "loss": 0.2188, "macro_f1": 0.4871794879436493, "num_tokens": 624497.0, "repeat_count": 0.0, "routers_loss": 0.02989846095442772, "skip_count": 2.0, "step": 396, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.1638225255972694, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.03125, "learning_rate": 0.0007940000000000001, "loss": 0.2, "macro_f1": 0.3333333432674408, "num_tokens": 627530.0, "repeat_count": 0.0, "routers_loss": 0.0030090075451880693, "skip_count": 0.0, "step": 398, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.1747440273037544, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.625, "learning_rate": 0.0007980000000000001, "loss": 0.1503, "macro_f1": 0.3272727429866791, "num_tokens": 630816.0, "repeat_count": 0.0, "routers_loss": 0.02026674523949623, "skip_count": 0.0, "step": 400, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.185665529010239, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.6875, "learning_rate": 0.0008020000000000001, "loss": 0.1285, "macro_f1": 0.3272727429866791, "num_tokens": 633715.0, "repeat_count": 1.0, "routers_loss": 0.08777285367250443, "skip_count": 0.0, "step": 402, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.1965870307167235, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.59375, "learning_rate": 0.0008060000000000001, "loss": 0.186, "macro_f1": 0.3272727429866791, "num_tokens": 636871.0, "repeat_count": 0.0, "routers_loss": 0.049915000796318054, "skip_count": 1.0, "step": 404, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.207508532423208, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 5.34375, "learning_rate": 0.0008100000000000001, "loss": 0.1592, "macro_f1": 0.5492662787437439, "num_tokens": 639784.0, "repeat_count": 0.0, "routers_loss": 0.05443386733531952, "skip_count": 2.0, "step": 406, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 2.218430034129693, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.90625, "learning_rate": 0.0008139999999999999, "loss": 0.1947, "macro_f1": 0.3272727429866791, "num_tokens": 642682.0, "repeat_count": 0.0, "routers_loss": 0.021953796967864037, "skip_count": 0.0, "step": 408, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.2293515358361775, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.625, "learning_rate": 0.0008179999999999999, "loss": 0.2197, "macro_f1": 0.3333333432674408, "num_tokens": 645962.0, "repeat_count": 0.0, "routers_loss": 0.010657553561031818, "skip_count": 0.0, "step": 410, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.240273037542662, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.421875, "learning_rate": 0.0008219999999999999, "loss": 0.2091, "macro_f1": 0.3333333432674408, "num_tokens": 649180.0, "repeat_count": 0.0, "routers_loss": 0.013879667967557907, "skip_count": 0.0, "step": 412, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.2511945392491466, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.0625, "learning_rate": 0.000826, "loss": 0.1555, "macro_f1": 0.31446540355682373, "num_tokens": 653015.0, "repeat_count": 0.0, "routers_loss": 0.12807206809520721, "skip_count": 2.0, "step": 414, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 25.0, "epoch": 2.2621160409556316, "f1_execute": 0.9166666269302368, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.453125, "learning_rate": 0.00083, "loss": 0.1335, "macro_f1": 0.5277777910232544, "num_tokens": 655892.0, "repeat_count": 2.0, "routers_loss": 0.8250671625137329, "skip_count": 3.0, "step": 416, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.273037542662116, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 5.8125, "learning_rate": 0.000834, "loss": 0.1831, "macro_f1": 0.5492662787437439, "num_tokens": 658426.0, "repeat_count": 0.0, "routers_loss": 0.03139641508460045, "skip_count": 2.0, "step": 418, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.2839590443686006, "f1_execute": 0.9615384340286255, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 3.40625, "learning_rate": 0.000838, "loss": 0.1345, "macro_f1": 0.5427350401878357, "num_tokens": 661809.0, "repeat_count": 2.0, "routers_loss": 0.0441780611872673, "skip_count": 0.0, "step": 420, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.294880546075085, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.84375, "learning_rate": 0.000842, "loss": 0.1127, "macro_f1": 0.3272727429866791, "num_tokens": 664874.0, "repeat_count": 0.0, "routers_loss": 0.44332680106163025, "skip_count": 1.0, "step": 422, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.3058020477815697, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.96875, "learning_rate": 0.000846, "loss": 0.1225, "macro_f1": 0.3272727429866791, "num_tokens": 668325.0, "repeat_count": 0.0, "routers_loss": 0.059455983340740204, "skip_count": 0.0, "step": 424, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.3167235494880547, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 6.8125, "learning_rate": 0.00085, "loss": 0.1816, "macro_f1": 0.5359477400779724, "num_tokens": 671097.0, "repeat_count": 2.0, "routers_loss": 0.3154633641242981, "skip_count": 2.0, "step": 426, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 25.0, "epoch": 2.3276450511945392, "f1_execute": 0.8979592323303223, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 5.65625, "learning_rate": 0.000854, "loss": 0.122, "macro_f1": 0.4104308784008026, "num_tokens": 674042.0, "repeat_count": 1.0, "routers_loss": 0.4580267667770386, "skip_count": 3.0, "step": 428, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.3385665529010238, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.21875, "learning_rate": 0.000858, "loss": 0.1113, "macro_f1": 0.3272727429866791, "num_tokens": 677016.0, "repeat_count": 0.0, "routers_loss": 0.015222650021314621, "skip_count": 0.0, "step": 430, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.3494880546075088, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.78125, "learning_rate": 0.000862, "loss": 0.1379, "macro_f1": 0.3333333432674408, "num_tokens": 679990.0, "repeat_count": 1.0, "routers_loss": 0.24279196560382843, "skip_count": 0.0, "step": 432, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.3604095563139933, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.28125, "learning_rate": 0.000866, "loss": 0.1476, "macro_f1": 0.3333333432674408, "num_tokens": 682786.0, "repeat_count": 1.0, "routers_loss": 0.1684337556362152, "skip_count": 0.0, "step": 434, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.371331058020478, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.0, "learning_rate": 0.00087, "loss": 0.1204, "macro_f1": 0.3272727429866791, "num_tokens": 685882.0, "repeat_count": 1.0, "routers_loss": 0.19464725255966187, "skip_count": 0.0, "step": 436, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.3822525597269624, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.71875, "learning_rate": 0.000874, "loss": 0.1124, "macro_f1": 0.32098764181137085, "num_tokens": 689570.0, "repeat_count": 0.0, "routers_loss": 0.05968143790960312, "skip_count": 2.0, "step": 438, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.393174061433447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.84375, "learning_rate": 0.000878, "loss": 0.1528, "macro_f1": 0.3333333432674408, "num_tokens": 693559.0, "repeat_count": 0.0, "routers_loss": 0.004517437424510717, "skip_count": 0.0, "step": 440, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.404095563139932, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.125, "learning_rate": 0.000882, "loss": 0.1353, "macro_f1": 0.3006536066532135, "num_tokens": 696374.0, "repeat_count": 0.0, "routers_loss": 0.26632770895957947, "skip_count": 2.0, "step": 442, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.4150170648464164, "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.75, "learning_rate": 0.0008860000000000001, "loss": 0.1874, "macro_f1": 0.2857142984867096, "num_tokens": 699954.0, "repeat_count": 1.0, "routers_loss": 0.3751397728919983, "skip_count": 3.0, "step": 444, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.425938566552901, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.28125, "learning_rate": 0.0008900000000000001, "loss": 0.2139, "macro_f1": 0.32098764181137085, "num_tokens": 703477.0, "repeat_count": 0.0, "routers_loss": 0.2166936844587326, "skip_count": 2.0, "step": 446, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.4368600682593855, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.0625, "learning_rate": 0.000894, "loss": 0.3078, "macro_f1": 0.3333333432674408, "num_tokens": 706342.0, "repeat_count": 0.0, "routers_loss": 0.004165076185017824, "skip_count": 0.0, "step": 448, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.4477815699658705, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.5625, "learning_rate": 0.000898, "loss": 0.3248, "macro_f1": 0.307692289352417, "num_tokens": 709048.0, "repeat_count": 0.0, "routers_loss": 0.11787679046392441, "skip_count": 1.0, "step": 450, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.458703071672355, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.875, "learning_rate": 0.000902, "loss": 0.2151, "macro_f1": 0.31446540355682373, "num_tokens": 712168.0, "repeat_count": 2.0, "routers_loss": 0.24694015085697174, "skip_count": 0.0, "step": 452, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.4696245733788396, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 9.0625, "learning_rate": 0.000906, "loss": 0.1899, "macro_f1": 0.5492662787437439, "num_tokens": 715867.0, "repeat_count": 0.0, "routers_loss": 0.14055466651916504, "skip_count": 2.0, "step": 454, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.480546075085324, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.84375, "learning_rate": 0.00091, "loss": 0.136, "macro_f1": 0.32098764181137085, "num_tokens": 718940.0, "repeat_count": 0.0, "routers_loss": 0.2996567487716675, "skip_count": 2.0, "step": 456, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.491467576791809, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 8.5625, "learning_rate": 0.0009140000000000001, "loss": 0.2439, "macro_f1": 0.5492662787437439, "num_tokens": 721407.0, "repeat_count": 0.0, "routers_loss": 0.032011453062295914, "skip_count": 2.0, "step": 458, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 2.5023890784982936, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 11.0, "learning_rate": 0.0009180000000000001, "loss": 0.2592, "macro_f1": 0.3144654333591461, "num_tokens": 726056.0, "repeat_count": 0.0, "routers_loss": 0.06647517532110214, "skip_count": 0.0, "step": 460, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.513310580204778, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.875, "learning_rate": 0.0009220000000000001, "loss": 0.1904, "macro_f1": 0.32098764181137085, "num_tokens": 729038.0, "repeat_count": 0.0, "routers_loss": 0.08919267356395721, "skip_count": 0.0, "step": 462, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 2.5242320819112627, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.46875, "learning_rate": 0.0009260000000000001, "loss": 0.1969, "macro_f1": 0.3006536066532135, "num_tokens": 732172.0, "repeat_count": 0.0, "routers_loss": 0.4903416037559509, "skip_count": 2.0, "step": 464, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 2.5351535836177472, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 5.5, "learning_rate": 0.00093, "loss": 0.1957, "macro_f1": 0.6666666865348816, "num_tokens": 735282.0, "repeat_count": 0.0, "routers_loss": 0.025489339604973793, "skip_count": 2.0, "step": 466, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.546075085324232, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.28125, "learning_rate": 0.000934, "loss": 0.2198, "macro_f1": 0.3333333432674408, "num_tokens": 739208.0, "repeat_count": 0.0, "routers_loss": 0.013121264986693859, "skip_count": 0.0, "step": 468, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.5569965870307167, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.15625, "learning_rate": 0.0009379999999999999, "loss": 0.3641, "macro_f1": 0.32098764181137085, "num_tokens": 741980.0, "repeat_count": 0.0, "routers_loss": 0.45740270614624023, "skip_count": 2.0, "step": 470, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.5679180887372013, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.90625, "learning_rate": 0.000942, "loss": 0.1668, "macro_f1": 0.31446540355682373, "num_tokens": 745551.0, "repeat_count": 0.0, "routers_loss": 0.1244814470410347, "skip_count": 2.0, "step": 472, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 2.5788395904436863, "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 12.25, "learning_rate": 0.000946, "loss": 0.2807, "macro_f1": 0.2857142984867096, "num_tokens": 748488.0, "repeat_count": 1.0, "routers_loss": 0.3303976058959961, "skip_count": 3.0, "step": 474, "text_loss": 0.0 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 2.589761092150171, "f1_execute": 0.9411764740943909, "f1_repeat": 0.4000000059604645, "f1_skip": 0.0, "grad_norm": 3.640625, "learning_rate": 0.00095, "loss": 0.1353, "macro_f1": 0.44705885648727417, "num_tokens": 752865.0, "repeat_count": 3.0, "routers_loss": 0.24396798014640808, "skip_count": 0.0, "step": 476, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 2.6006825938566553, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 5.59375, "learning_rate": 0.000954, "loss": 0.1584, "macro_f1": 0.4400000274181366, "num_tokens": 755653.0, "repeat_count": 0.0, "routers_loss": 0.09343712776899338, "skip_count": 3.0, "step": 478, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.61160409556314, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.4375, "learning_rate": 0.000958, "loss": 0.2014, "macro_f1": 0.3272727429866791, "num_tokens": 758567.0, "repeat_count": 0.0, "routers_loss": 0.03879999741911888, "skip_count": 1.0, "step": 480, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.6225255972696244, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.5625, "learning_rate": 0.000962, "loss": 0.2174, "macro_f1": 0.32098764181137085, "num_tokens": 762013.0, "repeat_count": 0.0, "routers_loss": 0.13902239501476288, "skip_count": 2.0, "step": 482, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.6334470989761094, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.125, "learning_rate": 0.000966, "loss": 0.2322, "macro_f1": 0.3272727429866791, "num_tokens": 764820.0, "repeat_count": 0.0, "routers_loss": 0.0281832292675972, "skip_count": 0.0, "step": 484, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 2.644368600682594, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 11.25, "learning_rate": 0.0009699999999999999, "loss": 0.178, "macro_f1": 0.29333335161209106, "num_tokens": 767962.0, "repeat_count": 0.0, "routers_loss": 0.3387240767478943, "skip_count": 2.0, "step": 486, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 2.6552901023890785, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.4375, "learning_rate": 0.000974, "loss": 0.1818, "macro_f1": 0.32098764181137085, "num_tokens": 771189.0, "repeat_count": 0.0, "routers_loss": 0.033774666488170624, "skip_count": 0.0, "step": 488, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.666211604095563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.875, "learning_rate": 0.000978, "loss": 0.2071, "macro_f1": 0.3333333432674408, "num_tokens": 774073.0, "repeat_count": 0.0, "routers_loss": 0.009604716673493385, "skip_count": 0.0, "step": 490, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.6771331058020476, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.46875, "learning_rate": 0.000982, "loss": 0.1853, "macro_f1": 0.3333333432674408, "num_tokens": 776722.0, "repeat_count": 0.0, "routers_loss": 0.0034638401120901108, "skip_count": 0.0, "step": 492, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.6880546075085325, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.6875, "learning_rate": 0.0009860000000000001, "loss": 0.2882, "macro_f1": 0.32098764181137085, "num_tokens": 780051.0, "repeat_count": 0.0, "routers_loss": 0.08520562946796417, "skip_count": 0.0, "step": 494, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.698976109215017, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.875, "learning_rate": 0.00099, "loss": 0.1995, "macro_f1": 0.3272727429866791, "num_tokens": 782813.0, "repeat_count": 0.0, "routers_loss": 0.16369783878326416, "skip_count": 1.0, "step": 496, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 2.7098976109215016, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.859375, "learning_rate": 0.000994, "loss": 0.1725, "macro_f1": 0.3006536066532135, "num_tokens": 785376.0, "repeat_count": 0.0, "routers_loss": 0.17243081331253052, "skip_count": 2.0, "step": 498, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 26.0, "epoch": 2.7208191126279866, "f1_execute": 0.8749999403953552, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 9.9375, "learning_rate": 0.000998, "loss": 0.1842, "macro_f1": 0.402777761220932, "num_tokens": 788030.0, "repeat_count": 2.0, "routers_loss": 0.15272235870361328, "skip_count": 4.0, "step": 500, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.731740614334471, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.4375, "learning_rate": 0.0009999999674012276, "loss": 0.1709, "macro_f1": 0.32098764181137085, "num_tokens": 791099.0, "repeat_count": 0.0, "routers_loss": 0.02299564890563488, "skip_count": 0.0, "step": 502, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 2.7426621160409557, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.96875, "learning_rate": 0.000999999706611075, "loss": 0.1858, "macro_f1": 0.3144654333591461, "num_tokens": 794155.0, "repeat_count": 0.0, "routers_loss": 0.0592501275241375, "skip_count": 0.0, "step": 504, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 2.75358361774744, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.71875, "learning_rate": 0.0009999991850309056, "loss": 0.1347, "macro_f1": 0.307692289352417, "num_tokens": 797457.0, "repeat_count": 0.0, "routers_loss": 0.07785549014806747, "skip_count": 1.0, "step": 506, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 2.7645051194539247, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 9.25, "learning_rate": 0.0009999984026609918, "loss": 0.1448, "macro_f1": 0.4803921580314636, "num_tokens": 800614.0, "repeat_count": 0.0, "routers_loss": 0.32612788677215576, "skip_count": 2.0, "step": 508, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.7754266211604097, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.84375, "learning_rate": 0.0009999973595017412, "loss": 0.2566, "macro_f1": 0.3272727429866791, "num_tokens": 804027.0, "repeat_count": 0.0, "routers_loss": 0.03253546729683876, "skip_count": 0.0, "step": 510, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 2.7863481228668943, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 4.6875, "learning_rate": 0.0009999960555536983, "loss": 0.1271, "macro_f1": 0.5359477400779724, "num_tokens": 807662.0, "repeat_count": 1.0, "routers_loss": 0.16023527085781097, "skip_count": 2.0, "step": 512, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.797269624573379, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.375, "learning_rate": 0.0009999944908175428, "loss": 0.1876, "macro_f1": 0.3272727429866791, "num_tokens": 810905.0, "repeat_count": 0.0, "routers_loss": 0.022885220125317574, "skip_count": 0.0, "step": 514, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.8081911262798633, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.078125, "learning_rate": 0.0009999926652940912, "loss": 0.1309, "macro_f1": 0.3333333432674408, "num_tokens": 814110.0, "repeat_count": 0.0, "routers_loss": 0.007647325750440359, "skip_count": 0.0, "step": 516, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.819112627986348, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.375, "learning_rate": 0.0009999905789842955, "loss": 0.2302, "macro_f1": 0.32098767161369324, "num_tokens": 816905.0, "repeat_count": 1.0, "routers_loss": 0.0514276959002018, "skip_count": 0.0, "step": 518, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.830034129692833, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.875, "learning_rate": 0.0009999882318892442, "loss": 0.2078, "macro_f1": 0.31446540355682373, "num_tokens": 819821.0, "repeat_count": 2.0, "routers_loss": 0.3009680211544037, "skip_count": 0.0, "step": 520, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 2.8409556313993174, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.859375, "learning_rate": 0.000999985624010161, "loss": 0.1296, "macro_f1": 0.32098767161369324, "num_tokens": 822580.0, "repeat_count": 0.0, "routers_loss": 0.05273444578051567, "skip_count": 1.0, "step": 522, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.851877133105802, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.5625, "learning_rate": 0.0009999827553484064, "loss": 0.2293, "macro_f1": 0.3333333432674408, "num_tokens": 825874.0, "repeat_count": 0.0, "routers_loss": 0.008311637677252293, "skip_count": 0.0, "step": 524, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 2.862798634812287, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.921875, "learning_rate": 0.0009999796259054763, "loss": 0.1759, "macro_f1": 0.29333335161209106, "num_tokens": 829040.0, "repeat_count": 3.0, "routers_loss": 1.207849383354187, "skip_count": 2.0, "step": 526, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.8737201365187715, "f1_execute": 0.9019608497619629, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.65625, "learning_rate": 0.0009999762356830036, "loss": 0.2089, "macro_f1": 0.3006536364555359, "num_tokens": 834261.0, "repeat_count": 2.0, "routers_loss": 0.5721967220306396, "skip_count": 3.0, "step": 528, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 2.884641638225256, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.6875, "learning_rate": 0.000999972584682756, "loss": 0.2308, "macro_f1": 0.29333335161209106, "num_tokens": 837501.0, "repeat_count": 0.0, "routers_loss": 0.09908123314380646, "skip_count": 2.0, "step": 530, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 2.8955631399317405, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.6875, "learning_rate": 0.0009999686729066381, "loss": 0.1818, "macro_f1": 0.32098764181137085, "num_tokens": 840390.0, "repeat_count": 0.0, "routers_loss": 0.04153004288673401, "skip_count": 0.0, "step": 532, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 26.0, "epoch": 2.906484641638225, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 5.09375, "learning_rate": 0.0009999645003566902, "loss": 0.1759, "macro_f1": 0.4400000274181366, "num_tokens": 843327.0, "repeat_count": 1.0, "routers_loss": 0.37754446268081665, "skip_count": 3.0, "step": 534, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 2.91740614334471, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.953125, "learning_rate": 0.0009999600670350882, "loss": 0.1873, "macro_f1": 0.4871794879436493, "num_tokens": 847028.0, "repeat_count": 0.0, "routers_loss": 0.03440186381340027, "skip_count": 2.0, "step": 536, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 2.9283276450511946, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 12.875, "learning_rate": 0.000999955372944145, "loss": 0.342, "macro_f1": 0.29333335161209106, "num_tokens": 850735.0, "repeat_count": 1.0, "routers_loss": 0.18292225897312164, "skip_count": 0.0, "step": 538, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.939249146757679, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.421875, "learning_rate": 0.0009999504180863087, "loss": 0.1714, "macro_f1": 0.32098764181137085, "num_tokens": 854731.0, "repeat_count": 1.0, "routers_loss": 0.31060779094696045, "skip_count": 1.0, "step": 540, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.9501706484641637, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.65625, "learning_rate": 0.0009999452024641636, "loss": 0.1744, "macro_f1": 0.3144654333591461, "num_tokens": 858249.0, "repeat_count": 1.0, "routers_loss": 0.09356094151735306, "skip_count": 2.0, "step": 542, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.961092150170648, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.4375, "learning_rate": 0.0009999397260804302, "loss": 0.1456, "macro_f1": 0.3333333432674408, "num_tokens": 860901.0, "repeat_count": 0.0, "routers_loss": 0.006649349816143513, "skip_count": 0.0, "step": 544, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 2.972013651877133, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.0, "learning_rate": 0.0009999339889379647, "loss": 0.191, "macro_f1": 0.3272727429866791, "num_tokens": 863756.0, "repeat_count": 0.0, "routers_loss": 0.024081196635961533, "skip_count": 0.0, "step": 546, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 2.9829351535836177, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.328125, "learning_rate": 0.0009999279910397597, "loss": 0.1806, "macro_f1": 0.4871794879436493, "num_tokens": 867242.0, "repeat_count": 0.0, "routers_loss": 0.06612888723611832, "skip_count": 2.0, "step": 548, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 2.9938566552901023, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.09375, "learning_rate": 0.000999921732388943, "loss": 0.1438, "macro_f1": 0.32098764181137085, "num_tokens": 870235.0, "repeat_count": 0.0, "routers_loss": 0.02564089559018612, "skip_count": 0.0, "step": 550, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 3.0, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 11.0, "learning_rate": 0.0009999152129887801, "loss": 0.1395, "macro_f1": 0.3006536066532135, "num_tokens": 872748.0, "repeat_count": 1.0, "routers_loss": 0.31180688738822937, "skip_count": 2.0, "step": 552, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.8333333134651184, "avg_layers": 25.0, "epoch": 3.0109215017064845, "f1_execute": 0.9523809552192688, "f1_repeat": 0.6666666865348816, "f1_skip": 0.9090909361839294, "grad_norm": 7.8125, "learning_rate": 0.0009999084328426704, "loss": 0.1243, "macro_f1": 0.8427128791809082, "num_tokens": 876257.0, "repeat_count": 1.0, "routers_loss": 0.06441941112279892, "skip_count": 6.0, "step": 554, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.021843003412969, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.4375, "learning_rate": 0.0009999013919541506, "loss": 0.2276, "macro_f1": 0.32098764181137085, "num_tokens": 879189.0, "repeat_count": 0.0, "routers_loss": 0.1297590732574463, "skip_count": 2.0, "step": 556, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 3.032764505119454, "f1_execute": 0.95652174949646, "f1_repeat": 0.6666666865348816, "f1_skip": 0.5714285373687744, "grad_norm": 2.953125, "learning_rate": 0.0009998940903268932, "loss": 0.1034, "macro_f1": 0.7315390110015869, "num_tokens": 882626.0, "repeat_count": 2.0, "routers_loss": 0.40159890055656433, "skip_count": 4.0, "step": 558, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.0436860068259386, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.96875, "learning_rate": 0.0009998865279647066, "loss": 0.1627, "macro_f1": 0.307692289352417, "num_tokens": 885572.0, "repeat_count": 0.0, "routers_loss": 0.05809749290347099, "skip_count": 3.0, "step": 560, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.054607508532423, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.8125, "learning_rate": 0.0009998787048715349, "loss": 0.1533, "macro_f1": 0.31446540355682373, "num_tokens": 889088.0, "repeat_count": 0.0, "routers_loss": 0.4470720589160919, "skip_count": 2.0, "step": 562, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.0655290102389077, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.078125, "learning_rate": 0.0009998706210514589, "loss": 0.167, "macro_f1": 0.3272727429866791, "num_tokens": 892449.0, "repeat_count": 0.0, "routers_loss": 0.017404144629836082, "skip_count": 0.0, "step": 564, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 24.0, "epoch": 3.0764505119453927, "f1_execute": 0.8749999403953552, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.5, "learning_rate": 0.0009998622765086946, "loss": 0.1492, "macro_f1": 0.2916666567325592, "num_tokens": 895586.0, "repeat_count": 1.0, "routers_loss": 0.3639675974845886, "skip_count": 1.0, "step": 566, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 23.0, "epoch": 3.087372013651877, "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, "f1_skip": 0.3333333134651184, "grad_norm": 9.0625, "learning_rate": 0.0009998536712475944, "loss": 0.2095, "macro_f1": 0.4104308485984802, "num_tokens": 898285.0, "repeat_count": 1.0, "routers_loss": 0.16401837766170502, "skip_count": 1.0, "step": 568, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 3.0982935153583617, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.921875, "learning_rate": 0.0009998448052726467, "loss": 0.1679, "macro_f1": 0.5427350401878357, "num_tokens": 901345.0, "repeat_count": 1.0, "routers_loss": 0.2740897238254547, "skip_count": 1.0, "step": 570, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.1092150170648463, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.921875, "learning_rate": 0.000999835678588476, "loss": 0.1513, "macro_f1": 0.3333333432674408, "num_tokens": 904674.0, "repeat_count": 0.0, "routers_loss": 0.004289933945983648, "skip_count": 0.0, "step": 572, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 28.0, "epoch": 3.1201365187713312, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 5.125, "learning_rate": 0.0009998262911998423, "loss": 0.2076, "macro_f1": 0.47333335876464844, "num_tokens": 908392.0, "repeat_count": 1.0, "routers_loss": 0.6915572881698608, "skip_count": 3.0, "step": 574, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 3.131058020477816, "f1_execute": 0.9387754797935486, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 5.65625, "learning_rate": 0.000999816643111642, "loss": 0.166, "macro_f1": 0.47959184646606445, "num_tokens": 911574.0, "repeat_count": 3.0, "routers_loss": 0.27853959798812866, "skip_count": 1.0, "step": 576, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 3.1419795221843003, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.6875, "learning_rate": 0.0009998067343289074, "loss": 0.2197, "macro_f1": 0.3076923191547394, "num_tokens": 914726.0, "repeat_count": 1.0, "routers_loss": 0.39462774991989136, "skip_count": 1.0, "step": 578, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.152901023890785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.515625, "learning_rate": 0.0009997965648568066, "loss": 0.1345, "macro_f1": 0.3333333432674408, "num_tokens": 918249.0, "repeat_count": 0.0, "routers_loss": 0.0032140507828444242, "skip_count": 0.0, "step": 580, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.1638225255972694, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.1875, "learning_rate": 0.000999786134700644, "loss": 0.1132, "macro_f1": 0.3333333432674408, "num_tokens": 921025.0, "repeat_count": 0.0, "routers_loss": 0.0016512145521119237, "skip_count": 0.0, "step": 582, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 3.1747440273037544, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.59375, "learning_rate": 0.0009997754438658595, "loss": 0.0915, "macro_f1": 0.3006536066532135, "num_tokens": 924102.0, "repeat_count": 0.0, "routers_loss": 0.6956021785736084, "skip_count": 2.0, "step": 584, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 3.185665529010239, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 9.1875, "learning_rate": 0.0009997644923580293, "loss": 0.1437, "macro_f1": 0.5359477400779724, "num_tokens": 927662.0, "repeat_count": 1.0, "routers_loss": 0.32544562220573425, "skip_count": 2.0, "step": 586, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.1965870307167235, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.25, "learning_rate": 0.0009997532801828658, "loss": 0.1488, "macro_f1": 0.3333333432674408, "num_tokens": 930556.0, "repeat_count": 0.0, "routers_loss": 0.00869440846145153, "skip_count": 0.0, "step": 588, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.207508532423208, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.65625, "learning_rate": 0.0009997418073462167, "loss": 0.1584, "macro_f1": 0.32098764181137085, "num_tokens": 933435.0, "repeat_count": 0.0, "routers_loss": 0.08498232066631317, "skip_count": 2.0, "step": 590, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.218430034129693, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.015625, "learning_rate": 0.0009997300738540662, "loss": 0.1075, "macro_f1": 0.32098764181137085, "num_tokens": 936478.0, "repeat_count": 0.0, "routers_loss": 0.19423364102840424, "skip_count": 2.0, "step": 592, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 3.2293515358361775, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 6.03125, "learning_rate": 0.000999718079712534, "loss": 0.1615, "macro_f1": 0.5492662787437439, "num_tokens": 939400.0, "repeat_count": 0.0, "routers_loss": 0.02402239292860031, "skip_count": 1.0, "step": 594, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 3.240273037542662, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 4.875, "learning_rate": 0.0009997058249278763, "loss": 0.221, "macro_f1": 0.6666666865348816, "num_tokens": 943300.0, "repeat_count": 1.0, "routers_loss": 0.0028402789030224085, "skip_count": 0.0, "step": 596, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.2511945392491466, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.09375, "learning_rate": 0.0009996933095064847, "loss": 0.1423, "macro_f1": 0.3144654333591461, "num_tokens": 947399.0, "repeat_count": 1.0, "routers_loss": 0.2962486445903778, "skip_count": 2.0, "step": 598, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.2621160409556316, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.625, "learning_rate": 0.0009996805334548872, "loss": 0.1535, "macro_f1": 0.29333335161209106, "num_tokens": 950094.0, "repeat_count": 0.0, "routers_loss": 0.47425299882888794, "skip_count": 4.0, "step": 600, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.4000000059604645, "avg_layers": 24.0, "epoch": 3.273037542662116, "f1_execute": 0.8636363744735718, "f1_repeat": 0.0, "f1_skip": 0.444444477558136, "grad_norm": 4.71875, "learning_rate": 0.0009996674967797476, "loss": 0.1282, "macro_f1": 0.43602699041366577, "num_tokens": 953673.0, "repeat_count": 3.0, "routers_loss": 0.3788261115550995, "skip_count": 5.0, "step": 602, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.2839590443686006, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.5625, "learning_rate": 0.0009996541994878655, "loss": 0.1239, "macro_f1": 0.3272727429866791, "num_tokens": 956885.0, "repeat_count": 1.0, "routers_loss": 0.13212358951568604, "skip_count": 0.0, "step": 604, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 3.294880546075085, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 3.828125, "learning_rate": 0.0009996406415861763, "loss": 0.0874, "macro_f1": 0.6601307392120361, "num_tokens": 959794.0, "repeat_count": 0.0, "routers_loss": 0.0332571342587471, "skip_count": 2.0, "step": 606, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.3058020477815697, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.5625, "learning_rate": 0.0009996268230817518, "loss": 0.1068, "macro_f1": 0.3333333432674408, "num_tokens": 963516.0, "repeat_count": 0.0, "routers_loss": 0.007200752384960651, "skip_count": 0.0, "step": 608, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.3167235494880547, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.75, "learning_rate": 0.0009996127439817993, "loss": 0.1237, "macro_f1": 0.3272727429866791, "num_tokens": 966363.0, "repeat_count": 0.0, "routers_loss": 0.23764896392822266, "skip_count": 1.0, "step": 610, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.3276450511945392, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.484375, "learning_rate": 0.0009995984042936621, "loss": 0.1411, "macro_f1": 0.3333333432674408, "num_tokens": 969265.0, "repeat_count": 0.0, "routers_loss": 0.0006030416116118431, "skip_count": 0.0, "step": 612, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 3.3385665529010238, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 4.8125, "learning_rate": 0.0009995838040248197, "loss": 0.1516, "macro_f1": 0.5492662787437439, "num_tokens": 972024.0, "repeat_count": 0.0, "routers_loss": 0.029178157448768616, "skip_count": 1.0, "step": 614, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 24.0, "epoch": 3.3494880546075088, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 6.0625, "learning_rate": 0.0009995689431828872, "loss": 0.132, "macro_f1": 0.41777777671813965, "num_tokens": 974328.0, "repeat_count": 0.0, "routers_loss": 0.41580793261528015, "skip_count": 2.0, "step": 616, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.3604095563139933, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.625, "learning_rate": 0.000999553821775616, "loss": 0.1495, "macro_f1": 0.307692289352417, "num_tokens": 977628.0, "repeat_count": 0.0, "routers_loss": 0.26905494928359985, "skip_count": 3.0, "step": 618, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.371331058020478, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.375, "learning_rate": 0.0009995384398108927, "loss": 0.1372, "macro_f1": 0.3333333432674408, "num_tokens": 980458.0, "repeat_count": 0.0, "routers_loss": 0.007225328590720892, "skip_count": 0.0, "step": 620, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 3.3822525597269624, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.984375, "learning_rate": 0.0009995227972967404, "loss": 0.1104, "macro_f1": 0.6603773832321167, "num_tokens": 983776.0, "repeat_count": 1.0, "routers_loss": 0.09698990732431412, "skip_count": 1.0, "step": 622, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.393174061433447, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.40625, "learning_rate": 0.000999506894241318, "loss": 0.1211, "macro_f1": 0.32098764181137085, "num_tokens": 986625.0, "repeat_count": 0.0, "routers_loss": 0.028710627928376198, "skip_count": 0.0, "step": 624, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.404095563139932, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 5.53125, "learning_rate": 0.0009994907306529201, "loss": 0.186, "macro_f1": 0.5427350401878357, "num_tokens": 989896.0, "repeat_count": 1.0, "routers_loss": 0.18436689674854279, "skip_count": 2.0, "step": 626, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 3.4150170648464164, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 5.71875, "learning_rate": 0.0009994743065399776, "loss": 0.1819, "macro_f1": 0.6666666865348816, "num_tokens": 992963.0, "repeat_count": 0.0, "routers_loss": 0.011628196574747562, "skip_count": 2.0, "step": 628, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.425938566552901, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.5625, "learning_rate": 0.0009994576219110565, "loss": 0.2279, "macro_f1": 0.3272727429866791, "num_tokens": 995486.0, "repeat_count": 0.0, "routers_loss": 0.03694930672645569, "skip_count": 0.0, "step": 630, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 3.4368600682593855, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.71875, "learning_rate": 0.0009994406767748596, "loss": 0.2908, "macro_f1": 0.3076923191547394, "num_tokens": 998880.0, "repeat_count": 1.0, "routers_loss": 0.3335764706134796, "skip_count": 1.0, "step": 632, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 3.4477815699658705, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 5.3125, "learning_rate": 0.000999423471140225, "loss": 0.1652, "macro_f1": 0.4871794879436493, "num_tokens": 1001623.0, "repeat_count": 0.0, "routers_loss": 0.03843867778778076, "skip_count": 2.0, "step": 634, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.458703071672355, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.15625, "learning_rate": 0.0009994060050161268, "loss": 0.1534, "macro_f1": 0.307692289352417, "num_tokens": 1004900.0, "repeat_count": 2.0, "routers_loss": 0.26561209559440613, "skip_count": 1.0, "step": 636, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 28.0, "epoch": 3.4696245733788396, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 6.40625, "learning_rate": 0.0009993882784116752, "loss": 0.147, "macro_f1": 0.4803921580314636, "num_tokens": 1008732.0, "repeat_count": 0.0, "routers_loss": 0.3012487590312958, "skip_count": 3.0, "step": 638, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.480546075085324, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.15625, "learning_rate": 0.0009993702913361155, "loss": 0.1252, "macro_f1": 0.3333333432674408, "num_tokens": 1011699.0, "repeat_count": 0.0, "routers_loss": 0.012646762654185295, "skip_count": 0.0, "step": 640, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 3.491467576791809, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.90625, "learning_rate": 0.0009993520437988302, "loss": 0.1487, "macro_f1": 0.480392187833786, "num_tokens": 1014406.0, "repeat_count": 1.0, "routers_loss": 0.1068505123257637, "skip_count": 3.0, "step": 642, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 3.5023890784982936, "f1_execute": 0.8085106015205383, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.34375, "learning_rate": 0.000999333535809336, "loss": 0.1731, "macro_f1": 0.26950353384017944, "num_tokens": 1017801.0, "repeat_count": 2.0, "routers_loss": 2.2939841747283936, "skip_count": 5.0, "step": 644, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.513310580204778, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.8125, "learning_rate": 0.0009993147673772868, "loss": 0.1609, "macro_f1": 0.3272727429866791, "num_tokens": 1021185.0, "repeat_count": 0.0, "routers_loss": 0.02110578864812851, "skip_count": 0.0, "step": 646, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 3.5242320819112627, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 4.90625, "learning_rate": 0.000999295738512472, "loss": 0.124, "macro_f1": 0.4533333480358124, "num_tokens": 1025108.0, "repeat_count": 0.0, "routers_loss": 0.15021832287311554, "skip_count": 2.0, "step": 648, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.5351535836177472, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.0, "learning_rate": 0.0009992764492248163, "loss": 0.2309, "macro_f1": 0.3333333432674408, "num_tokens": 1028805.0, "repeat_count": 0.0, "routers_loss": 0.002900304039940238, "skip_count": 0.0, "step": 650, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 26.0, "epoch": 3.546075085324232, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 5.0, "learning_rate": 0.0009992568995243808, "loss": 0.1452, "macro_f1": 0.44705885648727417, "num_tokens": 1032069.0, "repeat_count": 0.0, "routers_loss": 0.2886044383049011, "skip_count": 3.0, "step": 652, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.5569965870307167, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.0625, "learning_rate": 0.0009992370894213623, "loss": 0.1319, "macro_f1": 0.3144654333591461, "num_tokens": 1035634.0, "repeat_count": 1.0, "routers_loss": 0.42971259355545044, "skip_count": 2.0, "step": 654, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 3.5679180887372013, "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, "grad_norm": 7.375, "learning_rate": 0.000999217018926093, "loss": 0.1152, "macro_f1": 0.7795917987823486, "num_tokens": 1039948.0, "repeat_count": 1.0, "routers_loss": 0.07567094266414642, "skip_count": 3.0, "step": 656, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.5788395904436863, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.46875, "learning_rate": 0.0009991966880490417, "loss": 0.1425, "macro_f1": 0.3333333432674408, "num_tokens": 1043710.0, "repeat_count": 0.0, "routers_loss": 0.001569207408465445, "skip_count": 0.0, "step": 658, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.589761092150171, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.453125, "learning_rate": 0.0009991760968008124, "loss": 0.1177, "macro_f1": 0.3333333432674408, "num_tokens": 1047211.0, "repeat_count": 0.0, "routers_loss": 0.014489148743450642, "skip_count": 0.0, "step": 660, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.6006825938566553, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.34375, "learning_rate": 0.0009991552451921453, "loss": 0.104, "macro_f1": 0.32098767161369324, "num_tokens": 1050220.0, "repeat_count": 0.0, "routers_loss": 0.052834026515483856, "skip_count": 1.0, "step": 662, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 3.61160409556314, "f1_execute": 0.875, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 4.84375, "learning_rate": 0.0009991341332339157, "loss": 0.1706, "macro_f1": 0.625, "num_tokens": 1053982.0, "repeat_count": 1.0, "routers_loss": 0.2865705192089081, "skip_count": 3.0, "step": 664, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 3.6225255972696244, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.25, "learning_rate": 0.0009991127609371357, "loss": 0.1275, "macro_f1": 0.307692289352417, "num_tokens": 1056846.0, "repeat_count": 1.0, "routers_loss": 0.32878634333610535, "skip_count": 0.0, "step": 666, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 3.6334470989761094, "f1_execute": 0.9777777791023254, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 3.328125, "learning_rate": 0.0009990911283129524, "loss": 0.1348, "macro_f1": 0.8814815282821655, "num_tokens": 1059648.0, "repeat_count": 2.0, "routers_loss": 0.10558832436800003, "skip_count": 4.0, "step": 668, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 3.644368600682594, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.90625, "learning_rate": 0.0009990692353726489, "loss": 0.0572, "macro_f1": 0.6666666865348816, "num_tokens": 1062290.0, "repeat_count": 0.0, "routers_loss": 0.0071791489608585835, "skip_count": 2.0, "step": 670, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.6552901023890785, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.0625, "learning_rate": 0.0009990470821276442, "loss": 0.156, "macro_f1": 0.3272727429866791, "num_tokens": 1065212.0, "repeat_count": 0.0, "routers_loss": 0.028384100645780563, "skip_count": 0.0, "step": 672, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 3.666211604095563, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 7.4375, "learning_rate": 0.0009990246685894933, "loss": 0.1457, "macro_f1": 0.4871794879436493, "num_tokens": 1068029.0, "repeat_count": 0.0, "routers_loss": 0.03461477532982826, "skip_count": 2.0, "step": 674, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.6771331058020476, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.78125, "learning_rate": 0.0009990019947698863, "loss": 0.1055, "macro_f1": 0.3333333432674408, "num_tokens": 1071229.0, "repeat_count": 0.0, "routers_loss": 0.004003713373094797, "skip_count": 0.0, "step": 676, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 3.6880546075085325, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 2.015625, "learning_rate": 0.0009989790606806494, "loss": 0.1026, "macro_f1": 0.5934640765190125, "num_tokens": 1074046.0, "repeat_count": 0.0, "routers_loss": 0.03134514391422272, "skip_count": 3.0, "step": 678, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 3.698976109215017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.71875, "learning_rate": 0.0009989558663337447, "loss": 0.1402, "macro_f1": 0.6666666865348816, "num_tokens": 1076635.0, "repeat_count": 0.0, "routers_loss": 0.00439166184514761, "skip_count": 1.0, "step": 680, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 3.7098976109215016, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.09375, "learning_rate": 0.0009989324117412699, "loss": 0.1021, "macro_f1": 0.31446540355682373, "num_tokens": 1079958.0, "repeat_count": 0.0, "routers_loss": 0.12589046359062195, "skip_count": 2.0, "step": 682, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.7208191126279866, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.890625, "learning_rate": 0.0009989086969154587, "loss": 0.1762, "macro_f1": 0.3333333432674408, "num_tokens": 1082589.0, "repeat_count": 0.0, "routers_loss": 0.01050520222634077, "skip_count": 0.0, "step": 684, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.731740614334471, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.90625, "learning_rate": 0.0009988847218686796, "loss": 0.1527, "macro_f1": 0.3272727429866791, "num_tokens": 1085634.0, "repeat_count": 0.0, "routers_loss": 0.08884720504283905, "skip_count": 1.0, "step": 686, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 3.7426621160409557, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.5625, "learning_rate": 0.0009988604866134384, "loss": 0.196, "macro_f1": 0.29333335161209106, "num_tokens": 1088501.0, "repeat_count": 1.0, "routers_loss": 0.3627224862575531, "skip_count": 2.0, "step": 688, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.75358361774744, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.9375, "learning_rate": 0.0009988359911623748, "loss": 0.2456, "macro_f1": 0.3272727429866791, "num_tokens": 1091083.0, "repeat_count": 0.0, "routers_loss": 0.025369791314005852, "skip_count": 0.0, "step": 690, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.7645051194539247, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.46875, "learning_rate": 0.000998811235528266, "loss": 0.1186, "macro_f1": 0.3272727429866791, "num_tokens": 1095673.0, "repeat_count": 0.0, "routers_loss": 0.023373540490865707, "skip_count": 0.0, "step": 692, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.7754266211604097, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.03125, "learning_rate": 0.0009987862197240237, "loss": 0.1518, "macro_f1": 0.3272727429866791, "num_tokens": 1098519.0, "repeat_count": 0.0, "routers_loss": 0.014006087556481361, "skip_count": 0.0, "step": 694, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 3.7863481228668943, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.59375, "learning_rate": 0.0009987609437626954, "loss": 0.2149, "macro_f1": 0.31446540355682373, "num_tokens": 1101510.0, "repeat_count": 0.0, "routers_loss": 0.057559430599212646, "skip_count": 1.0, "step": 696, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.797269624573379, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.765625, "learning_rate": 0.0009987354076574648, "loss": 0.1507, "macro_f1": 0.3333333432674408, "num_tokens": 1104637.0, "repeat_count": 0.0, "routers_loss": 0.001837484072893858, "skip_count": 0.0, "step": 698, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8081911262798633, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.53125, "learning_rate": 0.0009987096114216511, "loss": 0.1046, "macro_f1": 0.3272727429866791, "num_tokens": 1107964.0, "repeat_count": 0.0, "routers_loss": 0.3758608400821686, "skip_count": 1.0, "step": 700, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 3.819112627986348, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 4.375, "learning_rate": 0.000998683555068709, "loss": 0.1269, "macro_f1": 0.5934640765190125, "num_tokens": 1111541.0, "repeat_count": 0.0, "routers_loss": 0.02019377611577511, "skip_count": 2.0, "step": 702, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.830034129692833, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.671875, "learning_rate": 0.000998657238612229, "loss": 0.1522, "macro_f1": 0.3272727429866791, "num_tokens": 1114819.0, "repeat_count": 0.0, "routers_loss": 0.019685756415128708, "skip_count": 0.0, "step": 704, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8409556313993174, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.484375, "learning_rate": 0.0009986306620659374, "loss": 0.1104, "macro_f1": 0.3333333432674408, "num_tokens": 1117888.0, "repeat_count": 0.0, "routers_loss": 0.0059326752088963985, "skip_count": 0.0, "step": 706, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 3.851877133105802, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.125, "learning_rate": 0.0009986038254436956, "loss": 0.1038, "macro_f1": 0.32098764181137085, "num_tokens": 1120946.0, "repeat_count": 0.0, "routers_loss": 0.022552471607923508, "skip_count": 0.0, "step": 708, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 3.862798634812287, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.625, "learning_rate": 0.0009985767287595015, "loss": 0.1433, "macro_f1": 0.4871794879436493, "num_tokens": 1124013.0, "repeat_count": 0.0, "routers_loss": 0.03914980590343475, "skip_count": 2.0, "step": 710, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 3.8737201365187715, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 5.0625, "learning_rate": 0.0009985493720274879, "loss": 0.1663, "macro_f1": 1.0, "num_tokens": 1127662.0, "repeat_count": 1.0, "routers_loss": 0.01359120849519968, "skip_count": 2.0, "step": 712, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 3.884641638225256, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.96875, "learning_rate": 0.0009985217552619236, "loss": 0.1134, "macro_f1": 0.3272727429866791, "num_tokens": 1130742.0, "repeat_count": 0.0, "routers_loss": 0.0699341893196106, "skip_count": 0.0, "step": 714, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.8955631399317405, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.75, "learning_rate": 0.000998493878477213, "loss": 0.1643, "macro_f1": 0.3333333432674408, "num_tokens": 1133386.0, "repeat_count": 0.0, "routers_loss": 0.006396451499313116, "skip_count": 0.0, "step": 716, "text_loss": 0.0 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 3.906484641638225, "f1_execute": 0.8292683362960815, "f1_repeat": 0.3333333432674408, "f1_skip": 0.6666666865348816, "grad_norm": 4.75, "learning_rate": 0.0009984657416878962, "loss": 0.1396, "macro_f1": 0.6097561120986938, "num_tokens": 1136071.0, "repeat_count": 3.0, "routers_loss": 0.23587316274642944, "skip_count": 6.0, "step": 718, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.91740614334471, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 9.625, "learning_rate": 0.0009984373449086485, "loss": 0.1686, "macro_f1": 0.3076923191547394, "num_tokens": 1139061.0, "repeat_count": 0.0, "routers_loss": 0.23841485381126404, "skip_count": 2.0, "step": 720, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 3.9283276450511946, "f1_execute": 0.9200000166893005, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 2.8125, "learning_rate": 0.0009984086881542815, "loss": 0.1112, "macro_f1": 0.5288889408111572, "num_tokens": 1141926.0, "repeat_count": 2.0, "routers_loss": 0.37492331862449646, "skip_count": 3.0, "step": 722, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 3.939249146757679, "f1_execute": 0.9166666865348816, "f1_repeat": 0.6666666865348816, "f1_skip": 0.4000000059604645, "grad_norm": 4.375, "learning_rate": 0.0009983797714397415, "loss": 0.1395, "macro_f1": 0.6611111164093018, "num_tokens": 1145302.0, "repeat_count": 2.0, "routers_loss": 0.5061943531036377, "skip_count": 2.0, "step": 724, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.9501706484641637, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 11.5625, "learning_rate": 0.0009983505947801115, "loss": 0.327, "macro_f1": 0.3272727429866791, "num_tokens": 1148991.0, "repeat_count": 0.0, "routers_loss": 0.030050436034798622, "skip_count": 0.0, "step": 726, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 3.961092150170648, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 4.84375, "learning_rate": 0.0009983211581906088, "loss": 0.2311, "macro_f1": 0.5492662787437439, "num_tokens": 1151711.0, "repeat_count": 0.0, "routers_loss": 0.04163246229290962, "skip_count": 2.0, "step": 728, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 3.972013651877133, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.0625, "learning_rate": 0.0009982914616865875, "loss": 0.1956, "macro_f1": 0.3333333432674408, "num_tokens": 1155061.0, "repeat_count": 0.0, "routers_loss": 0.002654903568327427, "skip_count": 0.0, "step": 730, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 3.9829351535836177, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.46875, "learning_rate": 0.0009982615052835364, "loss": 0.1239, "macro_f1": 0.31446540355682373, "num_tokens": 1158043.0, "repeat_count": 0.0, "routers_loss": 0.18476539850234985, "skip_count": 2.0, "step": 732, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 3.9938566552901023, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.625, "learning_rate": 0.0009982312889970804, "loss": 0.211, "macro_f1": 0.31446540355682373, "num_tokens": 1161487.0, "repeat_count": 2.0, "routers_loss": 0.33558642864227295, "skip_count": 0.0, "step": 734, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.0, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.5625, "learning_rate": 0.0009982008128429794, "loss": 0.14, "macro_f1": 0.3272727429866791, "num_tokens": 1163664.0, "repeat_count": 0.0, "routers_loss": 0.010565636679530144, "skip_count": 0.0, "step": 736, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.010921501706485, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.75, "learning_rate": 0.0009981700768371296, "loss": 0.0823, "macro_f1": 0.3333333432674408, "num_tokens": 1166461.0, "repeat_count": 0.0, "routers_loss": 0.001561413868330419, "skip_count": 0.0, "step": 738, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 4.021843003412969, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 5.125, "learning_rate": 0.000998139080995562, "loss": 0.1766, "macro_f1": 0.6666666865348816, "num_tokens": 1170134.0, "repeat_count": 0.0, "routers_loss": 0.010665918700397015, "skip_count": 2.0, "step": 740, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 4.032764505119454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.578125, "learning_rate": 0.0009981078253344432, "loss": 0.1177, "macro_f1": 0.3333333432674408, "num_tokens": 1173075.0, "repeat_count": 0.0, "routers_loss": 0.047345057129859924, "skip_count": 1.0, "step": 742, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 4.043686006825938, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.1875, "learning_rate": 0.000998076309870076, "loss": 0.0517, "macro_f1": 0.6666666865348816, "num_tokens": 1176281.0, "repeat_count": 0.0, "routers_loss": 0.0033105311449617147, "skip_count": 1.0, "step": 744, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 4.054607508532423, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.0625, "learning_rate": 0.000998044534618898, "loss": 0.0864, "macro_f1": 0.32098764181137085, "num_tokens": 1179403.0, "repeat_count": 0.0, "routers_loss": 0.033084314316511154, "skip_count": 0.0, "step": 746, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.065529010238908, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.234375, "learning_rate": 0.0009980124995974827, "loss": 0.0925, "macro_f1": 0.3006536066532135, "num_tokens": 1182596.0, "repeat_count": 1.0, "routers_loss": 0.21827591955661774, "skip_count": 3.0, "step": 748, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 4.076450511945392, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.125, "learning_rate": 0.0009979802048225388, "loss": 0.1244, "macro_f1": 0.4871794879436493, "num_tokens": 1186303.0, "repeat_count": 0.0, "routers_loss": 0.18225915729999542, "skip_count": 3.0, "step": 750, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 4.087372013651877, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 3.984375, "learning_rate": 0.0009979476503109107, "loss": 0.0728, "macro_f1": 0.5492662787437439, "num_tokens": 1189299.0, "repeat_count": 1.0, "routers_loss": 0.03163563460111618, "skip_count": 0.0, "step": 752, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 24.0, "epoch": 4.098293515358362, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 6.34375, "learning_rate": 0.000997914836079578, "loss": 0.148, "macro_f1": 0.41777777671813965, "num_tokens": 1192694.0, "repeat_count": 0.0, "routers_loss": 0.28674715757369995, "skip_count": 2.0, "step": 754, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.109215017064846, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.34375, "learning_rate": 0.0009978817621456562, "loss": 0.0869, "macro_f1": 0.31446540355682373, "num_tokens": 1196319.0, "repeat_count": 0.0, "routers_loss": 0.05852695554494858, "skip_count": 1.0, "step": 756, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.120136518771331, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.6484375, "learning_rate": 0.000997848428526396, "loss": 0.0648, "macro_f1": 0.5492662787437439, "num_tokens": 1199844.0, "repeat_count": 0.0, "routers_loss": 0.06834150850772858, "skip_count": 2.0, "step": 758, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.131058020477815, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.375, "learning_rate": 0.0009978148352391835, "loss": 0.0801, "macro_f1": 0.3333333432674408, "num_tokens": 1202876.0, "repeat_count": 0.0, "routers_loss": 0.0058227707631886005, "skip_count": 0.0, "step": 760, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 4.1419795221843, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.296875, "learning_rate": 0.00099778098230154, "loss": 0.1094, "macro_f1": 0.4871794879436493, "num_tokens": 1206870.0, "repeat_count": 0.0, "routers_loss": 0.079805389046669, "skip_count": 3.0, "step": 762, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 4.152901023890785, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.296875, "learning_rate": 0.0009977468697311232, "loss": 0.0902, "macro_f1": 0.3076923191547394, "num_tokens": 1209825.0, "repeat_count": 0.0, "routers_loss": 0.21695999801158905, "skip_count": 2.0, "step": 764, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.163822525597269, "f1_execute": 0.8749999403953552, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 3.265625, "learning_rate": 0.0009977124975457249, "loss": 0.1244, "macro_f1": 0.5138888955116272, "num_tokens": 1213093.0, "repeat_count": 2.0, "routers_loss": 0.12744387984275818, "skip_count": 4.0, "step": 766, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 4.174744027303754, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.34375, "learning_rate": 0.0009976778657632733, "loss": 0.0783, "macro_f1": 0.5427350401878357, "num_tokens": 1216291.0, "repeat_count": 0.0, "routers_loss": 0.07573267817497253, "skip_count": 2.0, "step": 768, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.1856655290102385, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.296875, "learning_rate": 0.0009976429744018313, "loss": 0.0752, "macro_f1": 0.3333333432674408, "num_tokens": 1219537.0, "repeat_count": 0.0, "routers_loss": 0.0009250715957023203, "skip_count": 0.0, "step": 770, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.1965870307167235, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.234375, "learning_rate": 0.0009976078234795983, "loss": 0.1114, "macro_f1": 0.3333333432674408, "num_tokens": 1222736.0, "repeat_count": 0.0, "routers_loss": 0.00175693747587502, "skip_count": 0.0, "step": 772, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.207508532423208, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.390625, "learning_rate": 0.0009975724130149076, "loss": 0.0918, "macro_f1": 0.5492662787437439, "num_tokens": 1226120.0, "repeat_count": 0.0, "routers_loss": 0.027441009879112244, "skip_count": 2.0, "step": 774, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.2184300341296925, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.46875, "learning_rate": 0.0009975367430262287, "loss": 0.0992, "macro_f1": 0.3272727429866791, "num_tokens": 1228810.0, "repeat_count": 0.0, "routers_loss": 0.027025407180190086, "skip_count": 0.0, "step": 776, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.2293515358361775, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.453125, "learning_rate": 0.0009975008135321667, "loss": 0.0931, "macro_f1": 0.3333333432674408, "num_tokens": 1231669.0, "repeat_count": 0.0, "routers_loss": 0.00917113944888115, "skip_count": 0.0, "step": 778, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.2402730375426625, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.890625, "learning_rate": 0.0009974646245514615, "loss": 0.0505, "macro_f1": 0.3333333432674408, "num_tokens": 1234476.0, "repeat_count": 0.0, "routers_loss": 0.010482276789844036, "skip_count": 0.0, "step": 780, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 4.251194539249147, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 4.28125, "learning_rate": 0.0009974281761029886, "loss": 0.0675, "macro_f1": 0.6666666865348816, "num_tokens": 1237748.0, "repeat_count": 0.0, "routers_loss": 0.009005382657051086, "skip_count": 1.0, "step": 782, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 4.262116040955632, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 2.9375, "learning_rate": 0.0009973914682057587, "loss": 0.1734, "macro_f1": 0.4871794879436493, "num_tokens": 1240362.0, "repeat_count": 0.0, "routers_loss": 0.09049399197101593, "skip_count": 2.0, "step": 784, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.273037542662116, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.984375, "learning_rate": 0.0009973545008789182, "loss": 0.1156, "macro_f1": 0.3333333432674408, "num_tokens": 1244147.0, "repeat_count": 0.0, "routers_loss": 0.0037465172354131937, "skip_count": 0.0, "step": 786, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.283959044368601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.84375, "learning_rate": 0.000997317274141748, "loss": 0.1302, "macro_f1": 0.3333333432674408, "num_tokens": 1247058.0, "repeat_count": 0.0, "routers_loss": 0.002100529847666621, "skip_count": 0.0, "step": 788, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 24.0, "epoch": 4.294880546075086, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.3333333432674408, "grad_norm": 3.03125, "learning_rate": 0.0009972797880136654, "loss": 0.0771, "macro_f1": 0.41777777671813965, "num_tokens": 1250331.0, "repeat_count": 0.0, "routers_loss": 0.08377297967672348, "skip_count": 2.0, "step": 790, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 4.30580204778157, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.421875, "learning_rate": 0.0009972420425142224, "loss": 0.0782, "macro_f1": 0.4871794879436493, "num_tokens": 1253848.0, "repeat_count": 0.0, "routers_loss": 0.06583717465400696, "skip_count": 2.0, "step": 792, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.316723549488055, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.28125, "learning_rate": 0.0009972040376631057, "loss": 0.1235, "macro_f1": 0.32098767161369324, "num_tokens": 1257122.0, "repeat_count": 0.0, "routers_loss": 0.12353084981441498, "skip_count": 1.0, "step": 794, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.327645051194539, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.171875, "learning_rate": 0.0009971657734801384, "loss": 0.0899, "macro_f1": 0.3333333432674408, "num_tokens": 1261136.0, "repeat_count": 0.0, "routers_loss": 0.004150724504143, "skip_count": 0.0, "step": 796, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.338566552901024, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.5625, "learning_rate": 0.0009971272499852784, "loss": 0.1815, "macro_f1": 0.3272727429866791, "num_tokens": 1264211.0, "repeat_count": 0.0, "routers_loss": 0.02800264209508896, "skip_count": 0.0, "step": 798, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 4.349488054607509, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.125, "learning_rate": 0.0009970884671986187, "loss": 0.1118, "macro_f1": 0.5492662787437439, "num_tokens": 1266964.0, "repeat_count": 0.0, "routers_loss": 0.05382822826504707, "skip_count": 1.0, "step": 800, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.360409556313993, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.1875, "learning_rate": 0.0009970494251403874, "loss": 0.1015, "macro_f1": 0.31446540355682373, "num_tokens": 1269856.0, "repeat_count": 0.0, "routers_loss": 0.20994320511817932, "skip_count": 2.0, "step": 802, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 4.371331058020478, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.296875, "learning_rate": 0.000997010123830948, "loss": 0.1095, "macro_f1": 0.31446540355682373, "num_tokens": 1272945.0, "repeat_count": 0.0, "routers_loss": 0.07841377705335617, "skip_count": 1.0, "step": 804, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 4.382252559726963, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 6.5625, "learning_rate": 0.0009969705632907999, "loss": 0.1242, "macro_f1": 0.6666666865348816, "num_tokens": 1276127.0, "repeat_count": 2.0, "routers_loss": 0.008330464363098145, "skip_count": 0.0, "step": 806, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.393174061433447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.4375, "learning_rate": 0.0009969307435405766, "loss": 0.1688, "macro_f1": 0.3333333432674408, "num_tokens": 1279056.0, "repeat_count": 0.0, "routers_loss": 0.004059277940541506, "skip_count": 0.0, "step": 808, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.404095563139932, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.3125, "learning_rate": 0.0009968906646010474, "loss": 0.1232, "macro_f1": 0.3333333432674408, "num_tokens": 1282092.0, "repeat_count": 0.0, "routers_loss": 0.005245010834187269, "skip_count": 0.0, "step": 810, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 4.415017064846416, "f1_execute": 0.9411765336990356, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 7.3125, "learning_rate": 0.0009968503264931167, "loss": 0.0964, "macro_f1": 0.6470588445663452, "num_tokens": 1285759.0, "repeat_count": 1.0, "routers_loss": 0.04135916382074356, "skip_count": 0.0, "step": 812, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 4.425938566552901, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.0, "learning_rate": 0.0009968097292378244, "loss": 0.1636, "macro_f1": 0.32098767161369324, "num_tokens": 1288141.0, "repeat_count": 0.0, "routers_loss": 0.11239507049322128, "skip_count": 1.0, "step": 814, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.436860068259386, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.71875, "learning_rate": 0.0009967688728563446, "loss": 0.1044, "macro_f1": 0.32098767161369324, "num_tokens": 1291293.0, "repeat_count": 1.0, "routers_loss": 0.3831826150417328, "skip_count": 0.0, "step": 816, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 4.44778156996587, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.875, "learning_rate": 0.0009967277573699875, "loss": 0.1445, "macro_f1": 0.32098764181137085, "num_tokens": 1293847.0, "repeat_count": 0.0, "routers_loss": 0.054437290877103806, "skip_count": 0.0, "step": 818, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.458703071672355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.859375, "learning_rate": 0.000996686382800198, "loss": 0.0712, "macro_f1": 0.3333333432674408, "num_tokens": 1296724.0, "repeat_count": 0.0, "routers_loss": 0.012091469950973988, "skip_count": 0.0, "step": 820, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 24.0, "epoch": 4.46962457337884, "f1_execute": 0.936170220375061, "f1_repeat": 0.0, "f1_skip": 0.75, "grad_norm": 4.4375, "learning_rate": 0.000996644749168557, "loss": 0.1332, "macro_f1": 0.5620567798614502, "num_tokens": 1299674.0, "repeat_count": 1.0, "routers_loss": 0.06590834259986877, "skip_count": 4.0, "step": 822, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 25.0, "epoch": 4.480546075085324, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 3.265625, "learning_rate": 0.0009966028564967785, "loss": 0.1285, "macro_f1": 0.4400000274181366, "num_tokens": 1302843.0, "repeat_count": 1.0, "routers_loss": 0.06902799010276794, "skip_count": 2.0, "step": 824, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 25.0, "epoch": 4.491467576791809, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 7.4375, "learning_rate": 0.0009965607048067137, "loss": 0.1249, "macro_f1": 0.44705885648727417, "num_tokens": 1305575.0, "repeat_count": 0.0, "routers_loss": 0.08320864289999008, "skip_count": 2.0, "step": 826, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 4.502389078498293, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.65625, "learning_rate": 0.0009965182941203481, "loss": 0.1834, "macro_f1": 0.32098767161369324, "num_tokens": 1308244.0, "repeat_count": 0.0, "routers_loss": 0.12352414429187775, "skip_count": 1.0, "step": 828, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.513310580204778, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.9375, "learning_rate": 0.0009964756244598021, "loss": 0.0915, "macro_f1": 0.3333333432674408, "num_tokens": 1311314.0, "repeat_count": 0.0, "routers_loss": 0.014358235523104668, "skip_count": 0.0, "step": 830, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.524232081911263, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.65625, "learning_rate": 0.0009964326958473316, "loss": 0.102, "macro_f1": 0.3272727429866791, "num_tokens": 1315495.0, "repeat_count": 0.0, "routers_loss": 0.008667540736496449, "skip_count": 0.0, "step": 832, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.535153583617747, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.484375, "learning_rate": 0.000996389508305327, "loss": 0.0822, "macro_f1": 0.3272727429866791, "num_tokens": 1319132.0, "repeat_count": 0.0, "routers_loss": 0.018217027187347412, "skip_count": 0.0, "step": 834, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 4.546075085324232, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 10.8125, "learning_rate": 0.000996346061856314, "loss": 0.2215, "macro_f1": 0.31446540355682373, "num_tokens": 1321294.0, "repeat_count": 0.0, "routers_loss": 0.1659325808286667, "skip_count": 1.0, "step": 836, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.556996587030717, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.625, "learning_rate": 0.0009963023565229536, "loss": 0.1108, "macro_f1": 0.3272727429866791, "num_tokens": 1324186.0, "repeat_count": 0.0, "routers_loss": 0.11435546725988388, "skip_count": 0.0, "step": 838, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.567918088737201, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.34375, "learning_rate": 0.0009962583923280419, "loss": 0.1153, "macro_f1": 0.3333333432674408, "num_tokens": 1327215.0, "repeat_count": 0.0, "routers_loss": 0.001215719268657267, "skip_count": 0.0, "step": 840, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.578839590443686, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.5625, "learning_rate": 0.0009962141692945092, "loss": 0.1181, "macro_f1": 0.3272727429866791, "num_tokens": 1330394.0, "repeat_count": 1.0, "routers_loss": 0.05636778846383095, "skip_count": 0.0, "step": 842, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 4.58976109215017, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 5.53125, "learning_rate": 0.0009961696874454219, "loss": 0.0985, "macro_f1": 0.5934640765190125, "num_tokens": 1333840.0, "repeat_count": 0.0, "routers_loss": 0.17423874139785767, "skip_count": 2.0, "step": 844, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 4.600682593856655, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.375, "learning_rate": 0.0009961249468039806, "loss": 0.1442, "macro_f1": 0.3272727429866791, "num_tokens": 1337481.0, "repeat_count": 0.0, "routers_loss": 0.08344361186027527, "skip_count": 0.0, "step": 846, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 4.611604095563139, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.1875, "learning_rate": 0.0009960799473935212, "loss": 0.1287, "macro_f1": 0.29333335161209106, "num_tokens": 1340525.0, "repeat_count": 1.0, "routers_loss": 0.10816935449838638, "skip_count": 2.0, "step": 848, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.622525597269624, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.703125, "learning_rate": 0.0009960346892375143, "loss": 0.1476, "macro_f1": 0.3272727429866791, "num_tokens": 1344963.0, "repeat_count": 0.0, "routers_loss": 0.02773604914546013, "skip_count": 0.0, "step": 850, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.633447098976109, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.25, "learning_rate": 0.000995989172359566, "loss": 0.074, "macro_f1": 0.3144654333591461, "num_tokens": 1347911.0, "repeat_count": 0.0, "routers_loss": 0.07946910709142685, "skip_count": 3.0, "step": 852, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.6443686006825935, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.5625, "learning_rate": 0.0009959433967834167, "loss": 0.0946, "macro_f1": 0.3272727429866791, "num_tokens": 1352093.0, "repeat_count": 0.0, "routers_loss": 0.20672957599163055, "skip_count": 1.0, "step": 854, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 4.6552901023890785, "f1_execute": 0.8780487775802612, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, "grad_norm": 3.109375, "learning_rate": 0.0009958973625329424, "loss": 0.1035, "macro_f1": 0.737127423286438, "num_tokens": 1355052.0, "repeat_count": 3.0, "routers_loss": 0.14273089170455933, "skip_count": 6.0, "step": 856, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 4.6662116040955635, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.15625, "learning_rate": 0.0009958510696321532, "loss": 0.1217, "macro_f1": 0.32098764181137085, "num_tokens": 1358739.0, "repeat_count": 0.0, "routers_loss": 0.03209677338600159, "skip_count": 0.0, "step": 858, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.6771331058020476, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.609375, "learning_rate": 0.000995804518105195, "loss": 0.1511, "macro_f1": 0.3272727429866791, "num_tokens": 1361816.0, "repeat_count": 0.0, "routers_loss": 0.016142090782523155, "skip_count": 0.0, "step": 860, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.6880546075085325, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.03125, "learning_rate": 0.0009957577079763478, "loss": 0.1588, "macro_f1": 0.3333333432674408, "num_tokens": 1365188.0, "repeat_count": 0.0, "routers_loss": 0.005357397720217705, "skip_count": 0.0, "step": 862, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.6989761092150175, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.53125, "learning_rate": 0.0009957106392700272, "loss": 0.0981, "macro_f1": 0.3333333432674408, "num_tokens": 1368207.0, "repeat_count": 0.0, "routers_loss": 0.005774896126240492, "skip_count": 0.0, "step": 864, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.709897610921502, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.859375, "learning_rate": 0.000995663312010783, "loss": 0.1432, "macro_f1": 0.3333333432674408, "num_tokens": 1370949.0, "repeat_count": 0.0, "routers_loss": 0.0034105523955076933, "skip_count": 0.0, "step": 866, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.720819112627987, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.6875, "learning_rate": 0.0009956157262233003, "loss": 0.1171, "macro_f1": 0.3272727429866791, "num_tokens": 1373855.0, "repeat_count": 0.0, "routers_loss": 0.00975721050053835, "skip_count": 0.0, "step": 868, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 4.731740614334471, "f1_execute": 0.8979592323303223, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 9.8125, "learning_rate": 0.000995567881932399, "loss": 0.1658, "macro_f1": 0.4326530694961548, "num_tokens": 1376396.0, "repeat_count": 1.0, "routers_loss": 0.3017057776451111, "skip_count": 3.0, "step": 870, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.742662116040956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.5625, "learning_rate": 0.0009955197791630336, "loss": 0.141, "macro_f1": 0.3333333432674408, "num_tokens": 1379027.0, "repeat_count": 0.0, "routers_loss": 0.008239896968007088, "skip_count": 0.0, "step": 872, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.753583617747441, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.53125, "learning_rate": 0.0009954714179402936, "loss": 0.1144, "macro_f1": 0.3333333432674408, "num_tokens": 1382288.0, "repeat_count": 0.0, "routers_loss": 0.010364998131990433, "skip_count": 0.0, "step": 874, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 4.764505119453925, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 6.53125, "learning_rate": 0.0009954227982894035, "loss": 0.1795, "macro_f1": 0.5492662787437439, "num_tokens": 1385672.0, "repeat_count": 0.0, "routers_loss": 0.15057335793972015, "skip_count": 1.0, "step": 876, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 4.77542662116041, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.90625, "learning_rate": 0.0009953739202357217, "loss": 0.1139, "macro_f1": 0.29333335161209106, "num_tokens": 1389206.0, "repeat_count": 1.0, "routers_loss": 0.42493173480033875, "skip_count": 3.0, "step": 878, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.786348122866894, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.46875, "learning_rate": 0.0009953247838047428, "loss": 0.1882, "macro_f1": 0.3333333432674408, "num_tokens": 1392492.0, "repeat_count": 0.0, "routers_loss": 0.005968689452856779, "skip_count": 0.0, "step": 880, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.797269624573379, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.046875, "learning_rate": 0.0009952753890220948, "loss": 0.1183, "macro_f1": 0.3272727429866791, "num_tokens": 1395478.0, "repeat_count": 0.0, "routers_loss": 0.14635904133319855, "skip_count": 1.0, "step": 882, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 4.808191126279864, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.34375, "learning_rate": 0.0009952257359135417, "loss": 0.1388, "macro_f1": 0.3006536066532135, "num_tokens": 1398518.0, "repeat_count": 0.0, "routers_loss": 0.1135154739022255, "skip_count": 2.0, "step": 884, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 4.819112627986348, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 4.65625, "learning_rate": 0.0009951758245049808, "loss": 0.179, "macro_f1": 0.5359477400779724, "num_tokens": 1401259.0, "repeat_count": 0.0, "routers_loss": 0.18914444744586945, "skip_count": 1.0, "step": 886, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 4.830034129692833, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.5, "learning_rate": 0.0009951256548224455, "loss": 0.0913, "macro_f1": 0.6603773832321167, "num_tokens": 1404149.0, "repeat_count": 1.0, "routers_loss": 0.04007445275783539, "skip_count": 1.0, "step": 888, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.840955631399318, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.15625, "learning_rate": 0.000995075226892103, "loss": 0.129, "macro_f1": 0.32098767161369324, "num_tokens": 1406960.0, "repeat_count": 0.0, "routers_loss": 0.4282263517379761, "skip_count": 1.0, "step": 890, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5714285969734192, "avg_layers": 27.0, "epoch": 4.851877133105802, "f1_execute": 0.8999999761581421, "f1_repeat": 0.800000011920929, "f1_skip": 0.7272727489471436, "grad_norm": 5.40625, "learning_rate": 0.0009950245407402557, "loss": 0.2196, "macro_f1": 0.8090909719467163, "num_tokens": 1409634.0, "repeat_count": 2.0, "routers_loss": 0.3470841348171234, "skip_count": 7.0, "step": 892, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 4.862798634812287, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.3125, "learning_rate": 0.0009949735963933404, "loss": 0.115, "macro_f1": 0.5487528443336487, "num_tokens": 1413390.0, "repeat_count": 1.0, "routers_loss": 0.05957069247961044, "skip_count": 2.0, "step": 894, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.873720136518771, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.53125, "learning_rate": 0.0009949223938779286, "loss": 0.0754, "macro_f1": 0.3333333432674408, "num_tokens": 1416605.0, "repeat_count": 0.0, "routers_loss": 0.002007940784096718, "skip_count": 0.0, "step": 896, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.884641638225256, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 8.1875, "learning_rate": 0.000994870933220727, "loss": 0.1282, "macro_f1": 0.4803921580314636, "num_tokens": 1420764.0, "repeat_count": 0.0, "routers_loss": 0.08513174206018448, "skip_count": 2.0, "step": 898, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.895563139931741, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.09375, "learning_rate": 0.0009948192144485757, "loss": 0.0972, "macro_f1": 0.32098767161369324, "num_tokens": 1424182.0, "repeat_count": 0.0, "routers_loss": 0.03853657469153404, "skip_count": 1.0, "step": 900, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 4.906484641638225, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 5.03125, "learning_rate": 0.0009947672375884506, "loss": 0.1737, "macro_f1": 0.6666666865348816, "num_tokens": 1426986.0, "repeat_count": 0.0, "routers_loss": 0.008192243054509163, "skip_count": 1.0, "step": 902, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 4.91740614334471, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 4.875, "learning_rate": 0.0009947150026674621, "loss": 0.0577, "macro_f1": 0.9265305995941162, "num_tokens": 1429981.0, "repeat_count": 1.0, "routers_loss": 0.06954901665449142, "skip_count": 2.0, "step": 904, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.928327645051194, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.234375, "learning_rate": 0.0009946625097128543, "loss": 0.168, "macro_f1": 0.32098767161369324, "num_tokens": 1432902.0, "repeat_count": 0.0, "routers_loss": 0.0880909413099289, "skip_count": 1.0, "step": 906, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 4.939249146757679, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.9921875, "learning_rate": 0.000994609758752007, "loss": 0.1445, "macro_f1": 0.3272727429866791, "num_tokens": 1436788.0, "repeat_count": 1.0, "routers_loss": 0.5064544081687927, "skip_count": 0.0, "step": 908, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 4.950170648464164, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 4.0625, "learning_rate": 0.0009945567498124339, "loss": 0.1658, "macro_f1": 0.5492662787437439, "num_tokens": 1439507.0, "repeat_count": 0.0, "routers_loss": 0.019065011292696, "skip_count": 2.0, "step": 910, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.961092150170648, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.40625, "learning_rate": 0.0009945034829217832, "loss": 0.0968, "macro_f1": 0.3272727429866791, "num_tokens": 1442860.0, "repeat_count": 0.0, "routers_loss": 0.018776487559080124, "skip_count": 0.0, "step": 912, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 4.972013651877133, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.71875, "learning_rate": 0.0009944499581078382, "loss": 0.1252, "macro_f1": 0.3076923191547394, "num_tokens": 1446637.0, "repeat_count": 0.0, "routers_loss": 0.1531504988670349, "skip_count": 2.0, "step": 914, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 4.982935153583618, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.21875, "learning_rate": 0.000994396175398516, "loss": 0.0992, "macro_f1": 0.3144654333591461, "num_tokens": 1450238.0, "repeat_count": 0.0, "routers_loss": 0.1735955774784088, "skip_count": 0.0, "step": 916, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 4.993856655290102, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.6875, "learning_rate": 0.000994342134821869, "loss": 0.1523, "macro_f1": 0.3272727429866791, "num_tokens": 1453160.0, "repeat_count": 0.0, "routers_loss": 0.15269255638122559, "skip_count": 0.0, "step": 918, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 5.0, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 12.4375, "learning_rate": 0.0009942878364060837, "loss": 0.1131, "macro_f1": 0.31446540355682373, "num_tokens": 1454580.0, "repeat_count": 1.0, "routers_loss": 0.2639358341693878, "skip_count": 0.0, "step": 920, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 5.010921501706485, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 5.09375, "learning_rate": 0.0009942332801794807, "loss": 0.1702, "macro_f1": 0.6601307392120361, "num_tokens": 1457292.0, "repeat_count": 0.0, "routers_loss": 0.043732915073633194, "skip_count": 2.0, "step": 922, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 5.021843003412969, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.8125, "learning_rate": 0.000994178466170516, "loss": 0.1107, "macro_f1": 0.6538461446762085, "num_tokens": 1460434.0, "repeat_count": 1.0, "routers_loss": 0.36936479806900024, "skip_count": 1.0, "step": 924, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 5.032764505119454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.09375, "learning_rate": 0.0009941233944077788, "loss": 0.0547, "macro_f1": 0.6666666865348816, "num_tokens": 1463373.0, "repeat_count": 0.0, "routers_loss": 0.0019650806207209826, "skip_count": 1.0, "step": 926, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.043686006825938, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.125, "learning_rate": 0.000994068064919994, "loss": 0.0665, "macro_f1": 0.32098764181137085, "num_tokens": 1466927.0, "repeat_count": 1.0, "routers_loss": 0.06489580124616623, "skip_count": 1.0, "step": 928, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.054607508532423, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.203125, "learning_rate": 0.0009940124777360203, "loss": 0.0898, "macro_f1": 0.3272727429866791, "num_tokens": 1469834.0, "repeat_count": 0.0, "routers_loss": 0.013250669464468956, "skip_count": 0.0, "step": 930, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.065529010238908, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.421875, "learning_rate": 0.0009939566328848507, "loss": 0.0616, "macro_f1": 0.3272727429866791, "num_tokens": 1472714.0, "repeat_count": 0.0, "routers_loss": 0.03642500564455986, "skip_count": 1.0, "step": 932, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.076450511945392, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.015625, "learning_rate": 0.000993900530395613, "loss": 0.0672, "macro_f1": 0.5492662787437439, "num_tokens": 1476458.0, "repeat_count": 0.0, "routers_loss": 0.019950609654188156, "skip_count": 2.0, "step": 934, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.087372013651877, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 4.34375, "learning_rate": 0.0009938441702975688, "loss": 0.0714, "macro_f1": 0.5492662787437439, "num_tokens": 1479499.0, "repeat_count": 0.0, "routers_loss": 0.05769496411085129, "skip_count": 2.0, "step": 936, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 5.098293515358362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 3.09375, "learning_rate": 0.000993787552620115, "loss": 0.0647, "macro_f1": 0.6666666865348816, "num_tokens": 1482112.0, "repeat_count": 0.0, "routers_loss": 0.006518410053104162, "skip_count": 2.0, "step": 938, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.109215017064846, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.359375, "learning_rate": 0.0009937306773927816, "loss": 0.0569, "macro_f1": 0.5492662787437439, "num_tokens": 1485128.0, "repeat_count": 0.0, "routers_loss": 0.16481046378612518, "skip_count": 2.0, "step": 940, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.120136518771331, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.375, "learning_rate": 0.0009936735446452341, "loss": 0.0689, "macro_f1": 0.3333333432674408, "num_tokens": 1487854.0, "repeat_count": 0.0, "routers_loss": 0.00462290458381176, "skip_count": 0.0, "step": 942, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.131058020477815, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.0625, "learning_rate": 0.0009936161544072716, "loss": 0.0596, "macro_f1": 0.3333333432674408, "num_tokens": 1490795.0, "repeat_count": 0.0, "routers_loss": 0.0042699906043708324, "skip_count": 0.0, "step": 944, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.1419795221843, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.78125, "learning_rate": 0.0009935585067088275, "loss": 0.1091, "macro_f1": 0.5492662787437439, "num_tokens": 1494150.0, "repeat_count": 0.0, "routers_loss": 0.01713154837489128, "skip_count": 2.0, "step": 946, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.152901023890785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.640625, "learning_rate": 0.0009935006015799703, "loss": 0.0893, "macro_f1": 0.3333333432674408, "num_tokens": 1497517.0, "repeat_count": 0.0, "routers_loss": 0.014775852672755718, "skip_count": 0.0, "step": 948, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.163822525597269, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.828125, "learning_rate": 0.0009934424390509017, "loss": 0.1128, "macro_f1": 0.32098767161369324, "num_tokens": 1500944.0, "repeat_count": 0.0, "routers_loss": 0.08066675066947937, "skip_count": 1.0, "step": 950, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.25, "avg_layers": 27.0, "epoch": 5.174744027303754, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 1.421875, "learning_rate": 0.0009933840191519584, "loss": 0.0536, "macro_f1": 0.44705885648727417, "num_tokens": 1504267.0, "repeat_count": 0.0, "routers_loss": 0.10788286477327347, "skip_count": 4.0, "step": 952, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 28.0, "epoch": 5.1856655290102385, "f1_execute": 0.9600000381469727, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 2.1875, "learning_rate": 0.0009933253419136107, "loss": 0.0582, "macro_f1": 0.8200000524520874, "num_tokens": 1507688.0, "repeat_count": 1.0, "routers_loss": 0.088263139128685, "skip_count": 3.0, "step": 954, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.1965870307167235, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.90625, "learning_rate": 0.000993266407366464, "loss": 0.0989, "macro_f1": 0.3333333432674408, "num_tokens": 1510658.0, "repeat_count": 0.0, "routers_loss": 0.005081284325569868, "skip_count": 0.0, "step": 956, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.207508532423208, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.59375, "learning_rate": 0.000993207215541257, "loss": 0.0562, "macro_f1": 0.5492662787437439, "num_tokens": 1515152.0, "repeat_count": 0.0, "routers_loss": 0.025190535932779312, "skip_count": 2.0, "step": 958, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.2184300341296925, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 2.3125, "learning_rate": 0.000993147766468863, "loss": 0.0672, "macro_f1": 0.6666666865348816, "num_tokens": 1518790.0, "repeat_count": 1.0, "routers_loss": 0.007869229651987553, "skip_count": 0.0, "step": 960, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.2293515358361775, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.609375, "learning_rate": 0.0009930880601802898, "loss": 0.0658, "macro_f1": 0.5427350401878357, "num_tokens": 1522153.0, "repeat_count": 1.0, "routers_loss": 0.15375611186027527, "skip_count": 2.0, "step": 962, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.2402730375426625, "f1_execute": 0.8444444537162781, "f1_repeat": 0.800000011920929, "f1_skip": 0.0, "grad_norm": 5.15625, "learning_rate": 0.0009930280967066787, "loss": 0.1698, "macro_f1": 0.5481481552124023, "num_tokens": 1525054.0, "repeat_count": 3.0, "routers_loss": 0.3285106122493744, "skip_count": 4.0, "step": 964, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 5.251194539249147, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 2.71875, "learning_rate": 0.0009929678760793057, "loss": 0.0853, "macro_f1": 0.4871794879436493, "num_tokens": 1528654.0, "repeat_count": 0.0, "routers_loss": 0.06668563932180405, "skip_count": 2.0, "step": 966, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 5.262116040955632, "f1_execute": 0.9166666865348816, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.734375, "learning_rate": 0.0009929073983295804, "loss": 0.0927, "macro_f1": 0.5277777910232544, "num_tokens": 1531379.0, "repeat_count": 2.0, "routers_loss": 0.2843759059906006, "skip_count": 4.0, "step": 968, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 5.273037542662116, "f1_execute": 0.936170220375061, "f1_repeat": 0.0, "f1_skip": 0.5714285373687744, "grad_norm": 2.265625, "learning_rate": 0.0009928466634890473, "loss": 0.0759, "macro_f1": 0.502532958984375, "num_tokens": 1534519.0, "repeat_count": 1.0, "routers_loss": 0.061425577849149704, "skip_count": 4.0, "step": 970, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 5.283959044368601, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 3.859375, "learning_rate": 0.0009927856715893839, "loss": 0.1502, "macro_f1": 0.4871794879436493, "num_tokens": 1537641.0, "repeat_count": 0.0, "routers_loss": 0.12876227498054504, "skip_count": 2.0, "step": 972, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 5.294880546075086, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 2.703125, "learning_rate": 0.0009927244226624029, "loss": 0.0589, "macro_f1": 0.4803921580314636, "num_tokens": 1540885.0, "repeat_count": 1.0, "routers_loss": 0.24013344943523407, "skip_count": 2.0, "step": 974, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 5.30580204778157, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.578125, "learning_rate": 0.00099266291674005, "loss": 0.1553, "macro_f1": 0.6666666865348816, "num_tokens": 1545093.0, "repeat_count": 0.0, "routers_loss": 0.008588392287492752, "skip_count": 1.0, "step": 976, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.316723549488055, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.0625, "learning_rate": 0.000992601153854406, "loss": 0.0732, "macro_f1": 0.3272727429866791, "num_tokens": 1547669.0, "repeat_count": 0.0, "routers_loss": 0.1047874391078949, "skip_count": 1.0, "step": 978, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 5.327645051194539, "f1_execute": 0.8571428656578064, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.15625, "learning_rate": 0.000992539134037685, "loss": 0.1686, "macro_f1": 0.2857142984867096, "num_tokens": 1550684.0, "repeat_count": 1.0, "routers_loss": 0.3830685019493103, "skip_count": 2.0, "step": 980, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.338566552901024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.296875, "learning_rate": 0.0009924768573222353, "loss": 0.0979, "macro_f1": 0.3333333432674408, "num_tokens": 1553458.0, "repeat_count": 0.0, "routers_loss": 0.0034001434687525034, "skip_count": 0.0, "step": 982, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.349488054607509, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.515625, "learning_rate": 0.0009924143237405392, "loss": 0.0553, "macro_f1": 0.3333333432674408, "num_tokens": 1557067.0, "repeat_count": 0.0, "routers_loss": 0.0015051440568640828, "skip_count": 0.0, "step": 984, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 5.360409556313993, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.375, "learning_rate": 0.0009923515333252128, "loss": 0.0821, "macro_f1": 0.3006536066532135, "num_tokens": 1560210.0, "repeat_count": 0.0, "routers_loss": 0.38080108165740967, "skip_count": 2.0, "step": 986, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 5.371331058020478, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 5.34375, "learning_rate": 0.0009922884861090068, "loss": 0.104, "macro_f1": 0.5359477400779724, "num_tokens": 1563164.0, "repeat_count": 1.0, "routers_loss": 0.15402451157569885, "skip_count": 1.0, "step": 988, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.382252559726963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.6875, "learning_rate": 0.0009922251821248053, "loss": 0.0596, "macro_f1": 0.3333333432674408, "num_tokens": 1566178.0, "repeat_count": 0.0, "routers_loss": 0.0008378620259463787, "skip_count": 0.0, "step": 990, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.393174061433447, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.5, "learning_rate": 0.0009921616214056258, "loss": 0.0858, "macro_f1": 0.3272727429866791, "num_tokens": 1568705.0, "repeat_count": 0.0, "routers_loss": 0.1363816112279892, "skip_count": 1.0, "step": 992, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 5.404095563139932, "f1_execute": 0.9166666865348816, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.125, "learning_rate": 0.000992097803984621, "loss": 0.0683, "macro_f1": 0.5277777910232544, "num_tokens": 1571934.0, "repeat_count": 2.0, "routers_loss": 0.15122386813163757, "skip_count": 4.0, "step": 994, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.415017064846416, "f1_execute": 0.9615384340286255, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 2.328125, "learning_rate": 0.0009920337298950765, "loss": 0.12, "macro_f1": 0.6538461446762085, "num_tokens": 1574947.0, "repeat_count": 1.0, "routers_loss": 0.16266369819641113, "skip_count": 1.0, "step": 996, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.425938566552901, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3203125, "learning_rate": 0.0009919693991704123, "loss": 0.0627, "macro_f1": 0.3333333432674408, "num_tokens": 1577895.0, "repeat_count": 0.0, "routers_loss": 0.002958054654300213, "skip_count": 0.0, "step": 998, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.436860068259386, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.703125, "learning_rate": 0.0009919048118441818, "loss": 0.1173, "macro_f1": 0.5492662787437439, "num_tokens": 1581513.0, "repeat_count": 0.0, "routers_loss": 0.08616811782121658, "skip_count": 2.0, "step": 1000, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 5.44778156996587, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.484375, "learning_rate": 0.0009918399679500727, "loss": 0.0671, "macro_f1": 0.307692289352417, "num_tokens": 1585175.0, "repeat_count": 1.0, "routers_loss": 0.12870429456233978, "skip_count": 0.0, "step": 1002, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 23.0, "epoch": 5.458703071672355, "f1_execute": 0.9777777791023254, "f1_repeat": 0.0, "f1_skip": 0.888888955116272, "grad_norm": 3.328125, "learning_rate": 0.000991774867521906, "loss": 0.0944, "macro_f1": 0.6222223043441772, "num_tokens": 1588350.0, "repeat_count": 2.0, "routers_loss": 0.25614410638809204, "skip_count": 4.0, "step": 1004, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.46962457337884, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.65625, "learning_rate": 0.0009917095105936372, "loss": 0.1914, "macro_f1": 0.307692289352417, "num_tokens": 1591325.0, "repeat_count": 0.0, "routers_loss": 0.04333430901169777, "skip_count": 1.0, "step": 1006, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.480546075085324, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.25, "learning_rate": 0.000991643897199355, "loss": 0.1364, "macro_f1": 0.3272727429866791, "num_tokens": 1594383.0, "repeat_count": 0.0, "routers_loss": 0.010989947244524956, "skip_count": 0.0, "step": 1008, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.491467576791809, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.28125, "learning_rate": 0.000991578027373282, "loss": 0.0654, "macro_f1": 0.3333333432674408, "num_tokens": 1597157.0, "repeat_count": 0.0, "routers_loss": 0.0017866395646706223, "skip_count": 0.0, "step": 1010, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.502389078498293, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.484375, "learning_rate": 0.0009915119011497744, "loss": 0.0874, "macro_f1": 0.5427350401878357, "num_tokens": 1600168.0, "repeat_count": 1.0, "routers_loss": 0.12546473741531372, "skip_count": 2.0, "step": 1012, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 5.513310580204778, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.78125, "learning_rate": 0.0009914455185633228, "loss": 0.0695, "macro_f1": 0.5492662787437439, "num_tokens": 1604118.0, "repeat_count": 0.0, "routers_loss": 0.01084210816770792, "skip_count": 1.0, "step": 1014, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.524232081911263, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.03125, "learning_rate": 0.0009913788796485508, "loss": 0.1385, "macro_f1": 0.3333333432674408, "num_tokens": 1606640.0, "repeat_count": 0.0, "routers_loss": 0.0019855820573866367, "skip_count": 0.0, "step": 1016, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.535153583617747, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.40625, "learning_rate": 0.0009913119844402161, "loss": 0.0912, "macro_f1": 0.3333333432674408, "num_tokens": 1609241.0, "repeat_count": 0.0, "routers_loss": 0.01878403127193451, "skip_count": 0.0, "step": 1018, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.546075085324232, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.328125, "learning_rate": 0.0009912448329732098, "loss": 0.0892, "macro_f1": 0.31446540355682373, "num_tokens": 1612277.0, "repeat_count": 0.0, "routers_loss": 0.10238949954509735, "skip_count": 2.0, "step": 1020, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.556996587030717, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.1875, "learning_rate": 0.0009911774252825566, "loss": 0.1212, "macro_f1": 0.3333333432674408, "num_tokens": 1616094.0, "repeat_count": 0.0, "routers_loss": 0.005001841112971306, "skip_count": 0.0, "step": 1022, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 5.567918088737201, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 6.125, "learning_rate": 0.0009911097614034154, "loss": 0.0825, "macro_f1": 0.5492662787437439, "num_tokens": 1618451.0, "repeat_count": 0.0, "routers_loss": 0.025912249460816383, "skip_count": 1.0, "step": 1024, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 5.578839590443686, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.1875, "learning_rate": 0.000991041841371078, "loss": 0.0878, "macro_f1": 0.31446540355682373, "num_tokens": 1622797.0, "repeat_count": 0.0, "routers_loss": 0.057937197387218475, "skip_count": 1.0, "step": 1026, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.58976109215017, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1953125, "learning_rate": 0.0009909736652209703, "loss": 0.0597, "macro_f1": 0.3333333432674408, "num_tokens": 1625961.0, "repeat_count": 0.0, "routers_loss": 0.003770297858864069, "skip_count": 0.0, "step": 1028, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.600682593856655, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.953125, "learning_rate": 0.0009909052329886519, "loss": 0.1011, "macro_f1": 0.3272727429866791, "num_tokens": 1629268.0, "repeat_count": 0.0, "routers_loss": 0.12888562679290771, "skip_count": 1.0, "step": 1030, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 5.611604095563139, "f1_execute": 0.8799999952316284, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.46875, "learning_rate": 0.0009908365447098154, "loss": 0.1655, "macro_f1": 0.29333335161209106, "num_tokens": 1631820.0, "repeat_count": 1.0, "routers_loss": 0.3434430658817291, "skip_count": 3.0, "step": 1032, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.622525597269624, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.390625, "learning_rate": 0.0009907676004202873, "loss": 0.0777, "macro_f1": 0.3333333432674408, "num_tokens": 1635455.0, "repeat_count": 0.0, "routers_loss": 0.027593854814767838, "skip_count": 0.0, "step": 1034, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6000000238418579, "avg_layers": 24.0, "epoch": 5.633447098976109, "f1_execute": 0.9130434393882751, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.078125, "learning_rate": 0.0009906984001560276, "loss": 0.0718, "macro_f1": 0.5265700817108154, "num_tokens": 1638332.0, "repeat_count": 1.0, "routers_loss": 0.37137264013290405, "skip_count": 5.0, "step": 1036, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.6443686006825935, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.734375, "learning_rate": 0.0009906289439531298, "loss": 0.0887, "macro_f1": 0.32098767161369324, "num_tokens": 1641686.0, "repeat_count": 1.0, "routers_loss": 0.08043651282787323, "skip_count": 0.0, "step": 1038, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 5.6552901023890785, "f1_execute": 0.9600000381469727, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, "grad_norm": 2.65625, "learning_rate": 0.000990559231847821, "loss": 0.0882, "macro_f1": 0.7644445300102234, "num_tokens": 1644729.0, "repeat_count": 2.0, "routers_loss": 0.2286449670791626, "skip_count": 2.0, "step": 1040, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.6662116040955635, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.9375, "learning_rate": 0.0009904892638764613, "loss": 0.0834, "macro_f1": 0.3333333432674408, "num_tokens": 1647815.0, "repeat_count": 0.0, "routers_loss": 0.00887050200253725, "skip_count": 0.0, "step": 1042, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.6771331058020476, "f1_execute": 0.9583333134651184, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.75, "learning_rate": 0.000990419040075545, "loss": 0.0812, "macro_f1": 0.875, "num_tokens": 1650891.0, "repeat_count": 1.0, "routers_loss": 0.04576364904642105, "skip_count": 4.0, "step": 1044, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 5.6880546075085325, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 3.40625, "learning_rate": 0.0009903485604816993, "loss": 0.0649, "macro_f1": 0.6666666865348816, "num_tokens": 1654705.0, "repeat_count": 0.0, "routers_loss": 0.003047809936106205, "skip_count": 2.0, "step": 1046, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.6989761092150175, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.4375, "learning_rate": 0.0009902778251316851, "loss": 0.1378, "macro_f1": 0.307692289352417, "num_tokens": 1657418.0, "repeat_count": 2.0, "routers_loss": 0.2059575766324997, "skip_count": 1.0, "step": 1048, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 5.709897610921502, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 2.0, "learning_rate": 0.0009902068340623964, "loss": 0.0462, "macro_f1": 1.0, "num_tokens": 1660534.0, "repeat_count": 1.0, "routers_loss": 0.004151428584009409, "skip_count": 1.0, "step": 1050, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.720819112627987, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.78125, "learning_rate": 0.000990135587310861, "loss": 0.0862, "macro_f1": 0.31446540355682373, "num_tokens": 1663186.0, "repeat_count": 0.0, "routers_loss": 0.3810015022754669, "skip_count": 2.0, "step": 1052, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.731740614334471, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.984375, "learning_rate": 0.0009900640849142394, "loss": 0.0508, "macro_f1": 0.3272727429866791, "num_tokens": 1665995.0, "repeat_count": 0.0, "routers_loss": 0.06376665085554123, "skip_count": 1.0, "step": 1054, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 5.742662116040956, "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, "grad_norm": 1.6015625, "learning_rate": 0.0009899923269098261, "loss": 0.0799, "macro_f1": 0.7795917987823486, "num_tokens": 1669511.0, "repeat_count": 1.0, "routers_loss": 0.04440099373459816, "skip_count": 3.0, "step": 1056, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.753583617747441, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.7421875, "learning_rate": 0.0009899203133350487, "loss": 0.046, "macro_f1": 0.3272727429866791, "num_tokens": 1672968.0, "repeat_count": 0.0, "routers_loss": 0.03650204837322235, "skip_count": 0.0, "step": 1058, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 5.764505119453925, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 6.9375, "learning_rate": 0.000989848044227468, "loss": 0.0694, "macro_f1": 1.0, "num_tokens": 1676646.0, "repeat_count": 1.0, "routers_loss": 0.0016751635121181607, "skip_count": 2.0, "step": 1060, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.77542662116041, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.734375, "learning_rate": 0.0009897755196247781, "loss": 0.1099, "macro_f1": 0.3333333432674408, "num_tokens": 1679543.0, "repeat_count": 0.0, "routers_loss": 0.0036270632408559322, "skip_count": 0.0, "step": 1062, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.786348122866894, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4765625, "learning_rate": 0.0009897027395648066, "loss": 0.1327, "macro_f1": 0.3333333432674408, "num_tokens": 1683460.0, "repeat_count": 0.0, "routers_loss": 0.011792323552072048, "skip_count": 0.0, "step": 1064, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 5.797269624573379, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 4.0625, "learning_rate": 0.0009896297040855137, "loss": 0.0977, "macro_f1": 0.6603773832321167, "num_tokens": 1686347.0, "repeat_count": 0.0, "routers_loss": 0.04671615734696388, "skip_count": 1.0, "step": 1066, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.808191126279864, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.0, "learning_rate": 0.0009895564132249939, "loss": 0.1001, "macro_f1": 0.3333333432674408, "num_tokens": 1689146.0, "repeat_count": 0.0, "routers_loss": 0.002473986241966486, "skip_count": 0.0, "step": 1068, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 5.819112627986348, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.375, "learning_rate": 0.0009894828670214738, "loss": 0.0734, "macro_f1": 0.3076923191547394, "num_tokens": 1692416.0, "repeat_count": 0.0, "routers_loss": 0.07151710987091064, "skip_count": 2.0, "step": 1070, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 5.830034129692833, "f1_execute": 0.9583333134651184, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 4.21875, "learning_rate": 0.0009894090655133136, "loss": 0.1305, "macro_f1": 0.875, "num_tokens": 1695385.0, "repeat_count": 4.0, "routers_loss": 0.27659133076667786, "skip_count": 1.0, "step": 1072, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.840955631399318, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.3125, "learning_rate": 0.0009893350087390072, "loss": 0.1088, "macro_f1": 0.3333333432674408, "num_tokens": 1698705.0, "repeat_count": 0.0, "routers_loss": 0.0123160844668746, "skip_count": 0.0, "step": 1074, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.851877133105802, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.734375, "learning_rate": 0.0009892606967371806, "loss": 0.1052, "macro_f1": 0.3333333432674408, "num_tokens": 1701608.0, "repeat_count": 0.0, "routers_loss": 0.007191153708845377, "skip_count": 0.0, "step": 1076, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.862798634812287, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.734375, "learning_rate": 0.0009891861295465941, "loss": 0.083, "macro_f1": 0.32098764181137085, "num_tokens": 1705091.0, "repeat_count": 0.0, "routers_loss": 0.028375793248414993, "skip_count": 0.0, "step": 1078, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 5.873720136518771, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.140625, "learning_rate": 0.0009891113072061399, "loss": 0.0812, "macro_f1": 0.5492662787437439, "num_tokens": 1708151.0, "repeat_count": 0.0, "routers_loss": 0.023821692913770676, "skip_count": 2.0, "step": 1080, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.884641638225256, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.8125, "learning_rate": 0.000989036229754844, "loss": 0.1053, "macro_f1": 0.3333333432674408, "num_tokens": 1712650.0, "repeat_count": 0.0, "routers_loss": 0.0011410097358748317, "skip_count": 0.0, "step": 1082, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.895563139931741, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.6640625, "learning_rate": 0.0009889608972318655, "loss": 0.1096, "macro_f1": 0.3333333432674408, "num_tokens": 1715711.0, "repeat_count": 0.0, "routers_loss": 0.0033964132890105247, "skip_count": 0.0, "step": 1084, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 5.906484641638225, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.234375, "learning_rate": 0.0009888853096764964, "loss": 0.0833, "macro_f1": 0.31446540355682373, "num_tokens": 1718440.0, "repeat_count": 0.0, "routers_loss": 0.21918612718582153, "skip_count": 1.0, "step": 1086, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.91740614334471, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.671875, "learning_rate": 0.0009888094671281612, "loss": 0.0551, "macro_f1": 0.3333333432674408, "num_tokens": 1722013.0, "repeat_count": 0.0, "routers_loss": 0.005385121796280146, "skip_count": 0.0, "step": 1088, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 5.928327645051194, "f1_execute": 0.95652174949646, "f1_repeat": 0.800000011920929, "f1_skip": 0.800000011920929, "grad_norm": 2.28125, "learning_rate": 0.0009887333696264188, "loss": 0.0851, "macro_f1": 0.8521739840507507, "num_tokens": 1725122.0, "repeat_count": 3.0, "routers_loss": 0.1673339158296585, "skip_count": 2.0, "step": 1090, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.939249146757679, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 1.71875, "learning_rate": 0.0009886570172109592, "loss": 0.063, "macro_f1": 0.6666666865348816, "num_tokens": 1727890.0, "repeat_count": 1.0, "routers_loss": 0.004812308587133884, "skip_count": 0.0, "step": 1092, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 5.950170648464164, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.609375, "learning_rate": 0.0009885804099216068, "loss": 0.1069, "macro_f1": 0.32098767161369324, "num_tokens": 1731038.0, "repeat_count": 0.0, "routers_loss": 0.07385388761758804, "skip_count": 1.0, "step": 1094, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 23.0, "epoch": 5.961092150170648, "f1_execute": 0.9387754797935486, "f1_repeat": 0.0, "f1_skip": 0.5714285969734192, "grad_norm": 4.34375, "learning_rate": 0.0009885035477983184, "loss": 0.1058, "macro_f1": 0.5034013986587524, "num_tokens": 1734527.0, "repeat_count": 0.0, "routers_loss": 0.038657210767269135, "skip_count": 2.0, "step": 1096, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 5.972013651877133, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.53125, "learning_rate": 0.0009884264308811837, "loss": 0.0574, "macro_f1": 0.3333333432674408, "num_tokens": 1736994.0, "repeat_count": 0.0, "routers_loss": 0.002709791297093034, "skip_count": 0.0, "step": 1098, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 5.982935153583618, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 3.28125, "learning_rate": 0.0009883490592104253, "loss": 0.074, "macro_f1": 0.6603773832321167, "num_tokens": 1740072.0, "repeat_count": 0.0, "routers_loss": 0.03136845678091049, "skip_count": 1.0, "step": 1100, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 5.993856655290102, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 4.0, "learning_rate": 0.000988271432826399, "loss": 0.1018, "macro_f1": 0.6666666865348816, "num_tokens": 1743666.0, "repeat_count": 1.0, "routers_loss": 0.0008747070096433163, "skip_count": 0.0, "step": 1102, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.0, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 24.125, "learning_rate": 0.0009881935517695932, "loss": 0.1521, "macro_f1": 0.31446540355682373, "num_tokens": 1745496.0, "repeat_count": 0.0, "routers_loss": 0.38018545508384705, "skip_count": 1.0, "step": 1104, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.010921501706485, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.71875, "learning_rate": 0.0009881154160806286, "loss": 0.0969, "macro_f1": 0.3272727429866791, "num_tokens": 1748331.0, "repeat_count": 0.0, "routers_loss": 0.013498244807124138, "skip_count": 0.0, "step": 1106, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.021843003412969, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.265625, "learning_rate": 0.00098803702580026, "loss": 0.0445, "macro_f1": 0.3333333432674408, "num_tokens": 1751426.0, "repeat_count": 0.0, "routers_loss": 0.007809123490005732, "skip_count": 0.0, "step": 1108, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.032764505119454, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.7578125, "learning_rate": 0.0009879583809693738, "loss": 0.0598, "macro_f1": 0.3333333432674408, "num_tokens": 1754387.0, "repeat_count": 0.0, "routers_loss": 0.005468994844704866, "skip_count": 0.0, "step": 1110, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.043686006825938, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.28125, "learning_rate": 0.0009878794816289894, "loss": 0.0374, "macro_f1": 0.3333333432674408, "num_tokens": 1756963.0, "repeat_count": 0.0, "routers_loss": 0.00034040669561363757, "skip_count": 0.0, "step": 1112, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.054607508532423, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.96875, "learning_rate": 0.0009878003278202597, "loss": 0.07, "macro_f1": 0.3333333432674408, "num_tokens": 1760506.0, "repeat_count": 0.0, "routers_loss": 0.005837638862431049, "skip_count": 0.0, "step": 1114, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.065529010238908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.15625, "learning_rate": 0.0009877209195844692, "loss": 0.0481, "macro_f1": 0.3333333432674408, "num_tokens": 1763181.0, "repeat_count": 0.0, "routers_loss": 0.0015226757386699319, "skip_count": 0.0, "step": 1116, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 6.076450511945392, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.03125, "learning_rate": 0.000987641256963036, "loss": 0.0813, "macro_f1": 0.6666666865348816, "num_tokens": 1766239.0, "repeat_count": 0.0, "routers_loss": 0.003985739313066006, "skip_count": 1.0, "step": 1118, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 6.087372013651877, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 2.234375, "learning_rate": 0.0009875613399975105, "loss": 0.0579, "macro_f1": 0.9265305995941162, "num_tokens": 1769355.0, "repeat_count": 1.0, "routers_loss": 0.07862874120473862, "skip_count": 3.0, "step": 1120, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 6.098293515358362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.8515625, "learning_rate": 0.0009874811687295758, "loss": 0.0363, "macro_f1": 0.6666666865348816, "num_tokens": 1773129.0, "repeat_count": 0.0, "routers_loss": 0.01232099998742342, "skip_count": 2.0, "step": 1122, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.109215017064846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.578125, "learning_rate": 0.0009874007432010476, "loss": 0.044, "macro_f1": 0.3333333432674408, "num_tokens": 1775676.0, "repeat_count": 0.0, "routers_loss": 0.0049852593801915646, "skip_count": 0.0, "step": 1124, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.120136518771331, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4375, "learning_rate": 0.0009873200634538746, "loss": 0.1125, "macro_f1": 0.3333333432674408, "num_tokens": 1779385.0, "repeat_count": 0.0, "routers_loss": 0.002998467069119215, "skip_count": 0.0, "step": 1126, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.131058020477815, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.5, "learning_rate": 0.0009872391295301373, "loss": 0.0537, "macro_f1": 0.3272727429866791, "num_tokens": 1782879.0, "repeat_count": 0.0, "routers_loss": 0.024785716086626053, "skip_count": 0.0, "step": 1128, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 6.1419795221843, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.40625, "learning_rate": 0.0009871579414720494, "loss": 0.049, "macro_f1": 0.6598639488220215, "num_tokens": 1786176.0, "repeat_count": 1.0, "routers_loss": 0.15303805470466614, "skip_count": 3.0, "step": 1130, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 6.152901023890785, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.1796875, "learning_rate": 0.0009870764993219574, "loss": 0.0295, "macro_f1": 0.6666666865348816, "num_tokens": 1789607.0, "repeat_count": 0.0, "routers_loss": 0.007586095482110977, "skip_count": 2.0, "step": 1132, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.6666666865348816, "avg_layers": 30.0, "epoch": 6.163822525597269, "f1_execute": 0.9090909361839294, "f1_repeat": 0.5714285373687744, "f1_skip": 0.800000011920929, "grad_norm": 7.15625, "learning_rate": 0.0009869948031223392, "loss": 0.067, "macro_f1": 0.7601732015609741, "num_tokens": 1792453.0, "repeat_count": 3.0, "routers_loss": 0.3679158389568329, "skip_count": 3.0, "step": 1134, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.174744027303754, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.484375, "learning_rate": 0.0009869128529158065, "loss": 0.1633, "macro_f1": 0.3333333432674408, "num_tokens": 1795984.0, "repeat_count": 0.0, "routers_loss": 0.008049221709370613, "skip_count": 0.0, "step": 1136, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.1856655290102385, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.453125, "learning_rate": 0.0009868306487451027, "loss": 0.0517, "macro_f1": 0.3333333432674408, "num_tokens": 1799460.0, "repeat_count": 0.0, "routers_loss": 0.006065836176276207, "skip_count": 0.0, "step": 1138, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 6.1965870307167235, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.1640625, "learning_rate": 0.0009867481906531037, "loss": 0.0355, "macro_f1": 0.6666666865348816, "num_tokens": 1802670.0, "repeat_count": 0.0, "routers_loss": 0.00862112082540989, "skip_count": 1.0, "step": 1140, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 6.207508532423208, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.75, "learning_rate": 0.0009866654786828185, "loss": 0.0917, "macro_f1": 0.3270440399646759, "num_tokens": 1805600.0, "repeat_count": 1.0, "routers_loss": 0.2556481659412384, "skip_count": 0.0, "step": 1142, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.2184300341296925, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 2.65625, "learning_rate": 0.0009865825128773874, "loss": 0.0547, "macro_f1": 0.5492662787437439, "num_tokens": 1808530.0, "repeat_count": 2.0, "routers_loss": 0.034160859882831573, "skip_count": 0.0, "step": 1144, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.2293515358361775, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2578125, "learning_rate": 0.0009864992932800845, "loss": 0.0355, "macro_f1": 0.3272727429866791, "num_tokens": 1811991.0, "repeat_count": 1.0, "routers_loss": 0.01372818648815155, "skip_count": 0.0, "step": 1146, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.2402730375426625, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 2.625, "learning_rate": 0.000986415819934315, "loss": 0.09, "macro_f1": 0.6666666865348816, "num_tokens": 1815213.0, "repeat_count": 1.0, "routers_loss": 0.001231279456987977, "skip_count": 0.0, "step": 1148, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.251194539249147, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.828125, "learning_rate": 0.000986332092883617, "loss": 0.1053, "macro_f1": 0.3333333432674408, "num_tokens": 1819066.0, "repeat_count": 0.0, "routers_loss": 0.0006753680645488203, "skip_count": 0.0, "step": 1150, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.262116040955632, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.3125, "learning_rate": 0.000986248112171661, "loss": 0.1026, "macro_f1": 0.5492662787437439, "num_tokens": 1821892.0, "repeat_count": 0.0, "routers_loss": 0.09902634471654892, "skip_count": 2.0, "step": 1152, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 6.273037542662116, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 2.6875, "learning_rate": 0.0009861638778422494, "loss": 0.1133, "macro_f1": 0.4871794879436493, "num_tokens": 1825546.0, "repeat_count": 0.0, "routers_loss": 0.1018824502825737, "skip_count": 2.0, "step": 1154, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.283959044368601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.09375, "learning_rate": 0.0009860793899393178, "loss": 0.0572, "macro_f1": 0.3333333432674408, "num_tokens": 1829001.0, "repeat_count": 0.0, "routers_loss": 0.007587054278701544, "skip_count": 0.0, "step": 1156, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 6.294880546075086, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 2.25, "learning_rate": 0.0009859946485069326, "loss": 0.1146, "macro_f1": 0.480392187833786, "num_tokens": 1831872.0, "repeat_count": 1.0, "routers_loss": 0.24491754174232483, "skip_count": 3.0, "step": 1158, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 6.30580204778157, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 2.453125, "learning_rate": 0.0009859096535892939, "loss": 0.0664, "macro_f1": 0.4871794879436493, "num_tokens": 1834785.0, "repeat_count": 0.0, "routers_loss": 0.23575152456760406, "skip_count": 2.0, "step": 1160, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.316723549488055, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.359375, "learning_rate": 0.000985824405230733, "loss": 0.0743, "macro_f1": 0.3333333432674408, "num_tokens": 1837557.0, "repeat_count": 0.0, "routers_loss": 0.011565971188247204, "skip_count": 0.0, "step": 1162, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 6.327645051194539, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 5.09375, "learning_rate": 0.000985738903475714, "loss": 0.1202, "macro_f1": 0.6601307392120361, "num_tokens": 1841258.0, "repeat_count": 1.0, "routers_loss": 0.1322765052318573, "skip_count": 2.0, "step": 1164, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.338566552901024, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.53125, "learning_rate": 0.000985653148368833, "loss": 0.0753, "macro_f1": 0.32098764181137085, "num_tokens": 1844015.0, "repeat_count": 0.0, "routers_loss": 0.09913542866706848, "skip_count": 2.0, "step": 1166, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.349488054607509, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.703125, "learning_rate": 0.000985567139954818, "loss": 0.0975, "macro_f1": 0.3272727429866791, "num_tokens": 1847500.0, "repeat_count": 1.0, "routers_loss": 0.32083266973495483, "skip_count": 0.0, "step": 1168, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.360409556313993, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.53125, "learning_rate": 0.0009854808782785295, "loss": 0.0483, "macro_f1": 0.3333333432674408, "num_tokens": 1849952.0, "repeat_count": 0.0, "routers_loss": 0.004351979121565819, "skip_count": 0.0, "step": 1170, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.371331058020478, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.75, "learning_rate": 0.0009853943633849594, "loss": 0.0973, "macro_f1": 0.31446540355682373, "num_tokens": 1852818.0, "repeat_count": 1.0, "routers_loss": 0.3433290123939514, "skip_count": 1.0, "step": 1172, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.382252559726963, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.65625, "learning_rate": 0.0009853075953192326, "loss": 0.0806, "macro_f1": 0.5492662787437439, "num_tokens": 1855763.0, "repeat_count": 0.0, "routers_loss": 0.014246528968214989, "skip_count": 2.0, "step": 1174, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 6.393174061433447, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.734375, "learning_rate": 0.0009852205741266056, "loss": 0.0786, "macro_f1": 0.32098764181137085, "num_tokens": 1858697.0, "repeat_count": 0.0, "routers_loss": 0.021391790360212326, "skip_count": 0.0, "step": 1176, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.404095563139932, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.796875, "learning_rate": 0.000985133299852467, "loss": 0.0607, "macro_f1": 0.3333333432674408, "num_tokens": 1861428.0, "repeat_count": 0.0, "routers_loss": 0.001103987917304039, "skip_count": 0.0, "step": 1178, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 6.415017064846416, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 10.4375, "learning_rate": 0.000985045772542337, "loss": 0.1274, "macro_f1": 0.9265305995941162, "num_tokens": 1864502.0, "repeat_count": 1.0, "routers_loss": 0.04696040600538254, "skip_count": 3.0, "step": 1180, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.425938566552901, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.59375, "learning_rate": 0.0009849579922418686, "loss": 0.0311, "macro_f1": 0.3333333432674408, "num_tokens": 1869038.0, "repeat_count": 0.0, "routers_loss": 0.006302578374743462, "skip_count": 0.0, "step": 1182, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 6.436860068259386, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.15625, "learning_rate": 0.0009848699589968457, "loss": 0.0732, "macro_f1": 0.6601307392120361, "num_tokens": 1872012.0, "repeat_count": 1.0, "routers_loss": 0.07310613244771957, "skip_count": 2.0, "step": 1184, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.44778156996587, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 5.90625, "learning_rate": 0.0009847816728531852, "loss": 0.1115, "macro_f1": 0.3272727429866791, "num_tokens": 1875285.0, "repeat_count": 0.0, "routers_loss": 0.006697377189993858, "skip_count": 0.0, "step": 1186, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 6.458703071672355, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 2.71875, "learning_rate": 0.0009846931338569351, "loss": 0.0991, "macro_f1": 0.5866667032241821, "num_tokens": 1878316.0, "repeat_count": 1.0, "routers_loss": 0.05192936211824417, "skip_count": 3.0, "step": 1188, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.46962457337884, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.140625, "learning_rate": 0.000984604342054276, "loss": 0.0743, "macro_f1": 0.3272727429866791, "num_tokens": 1881368.0, "repeat_count": 1.0, "routers_loss": 0.6876614093780518, "skip_count": 0.0, "step": 1190, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.480546075085324, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5703125, "learning_rate": 0.0009845152974915195, "loss": 0.0558, "macro_f1": 0.31446540355682373, "num_tokens": 1884812.0, "repeat_count": 0.0, "routers_loss": 0.044164709746837616, "skip_count": 2.0, "step": 1192, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.491467576791809, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.359375, "learning_rate": 0.0009844260002151097, "loss": 0.0523, "macro_f1": 0.32098764181137085, "num_tokens": 1887824.0, "repeat_count": 0.0, "routers_loss": 0.21223463118076324, "skip_count": 2.0, "step": 1194, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 6.502389078498293, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.484375, "learning_rate": 0.0009843364502716224, "loss": 0.0384, "macro_f1": 0.32098764181137085, "num_tokens": 1891796.0, "repeat_count": 0.0, "routers_loss": 0.02757224440574646, "skip_count": 0.0, "step": 1196, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 6.513310580204778, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.4375, "learning_rate": 0.0009842466477077648, "loss": 0.0692, "macro_f1": 0.32098764181137085, "num_tokens": 1895433.0, "repeat_count": 0.0, "routers_loss": 0.21957652270793915, "skip_count": 0.0, "step": 1198, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.524232081911263, "f1_execute": 0.9411764740943909, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 2.34375, "learning_rate": 0.0009841565925703766, "loss": 0.0538, "macro_f1": 0.6470588445663452, "num_tokens": 1899036.0, "repeat_count": 1.0, "routers_loss": 0.2702154517173767, "skip_count": 2.0, "step": 1200, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 6.535153583617747, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.578125, "learning_rate": 0.0009840662849064283, "loss": 0.0766, "macro_f1": 0.5492662787437439, "num_tokens": 1902046.0, "repeat_count": 0.0, "routers_loss": 0.017532862722873688, "skip_count": 1.0, "step": 1202, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.546075085324232, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.265625, "learning_rate": 0.000983975724763023, "loss": 0.0627, "macro_f1": 0.3333333432674408, "num_tokens": 1906017.0, "repeat_count": 0.0, "routers_loss": 0.006354475859552622, "skip_count": 0.0, "step": 1204, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.556996587030717, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.078125, "learning_rate": 0.000983884912187395, "loss": 0.0955, "macro_f1": 0.3333333432674408, "num_tokens": 1908813.0, "repeat_count": 0.0, "routers_loss": 0.0036974933464080095, "skip_count": 0.0, "step": 1206, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 6.567918088737201, "f1_execute": 0.9019607901573181, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.15625, "learning_rate": 0.00098379384722691, "loss": 0.0968, "macro_f1": 0.3006536066532135, "num_tokens": 1911626.0, "repeat_count": 3.0, "routers_loss": 0.9854266047477722, "skip_count": 0.0, "step": 1208, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.578839590443686, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.875, "learning_rate": 0.0009837025299290663, "loss": 0.1186, "macro_f1": 0.30666667222976685, "num_tokens": 1913970.0, "repeat_count": 1.0, "routers_loss": 0.8648518323898315, "skip_count": 4.0, "step": 1210, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.58976109215017, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.9375, "learning_rate": 0.0009836109603414925, "loss": 0.0896, "macro_f1": 0.3272727429866791, "num_tokens": 1917093.0, "repeat_count": 0.0, "routers_loss": 0.01036280207335949, "skip_count": 0.0, "step": 1212, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.600682593856655, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.125, "learning_rate": 0.0009835191385119501, "loss": 0.1323, "macro_f1": 0.3333333432674408, "num_tokens": 1919812.0, "repeat_count": 0.0, "routers_loss": 0.0218805018812418, "skip_count": 0.0, "step": 1214, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.611604095563139, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.28125, "learning_rate": 0.0009834270644883312, "loss": 0.1347, "macro_f1": 0.32098764181137085, "num_tokens": 1922686.0, "repeat_count": 0.0, "routers_loss": 0.1494085192680359, "skip_count": 0.0, "step": 1216, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.622525597269624, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 2.734375, "learning_rate": 0.00098333473831866, "loss": 0.0762, "macro_f1": 0.4803921580314636, "num_tokens": 1927138.0, "repeat_count": 0.0, "routers_loss": 0.10250654816627502, "skip_count": 2.0, "step": 1218, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.633447098976109, "f1_execute": 0.936170220375061, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, "grad_norm": 2.828125, "learning_rate": 0.000983242160051092, "loss": 0.0719, "macro_f1": 0.7565011978149414, "num_tokens": 1929710.0, "repeat_count": 2.0, "routers_loss": 0.1922699362039566, "skip_count": 4.0, "step": 1220, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.6443686006825935, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.078125, "learning_rate": 0.0009831493297339142, "loss": 0.0931, "macro_f1": 0.32098764181137085, "num_tokens": 1933318.0, "repeat_count": 0.0, "routers_loss": 0.022764762863516808, "skip_count": 2.0, "step": 1222, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.6552901023890785, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.859375, "learning_rate": 0.0009830562474155448, "loss": 0.0711, "macro_f1": 0.5492662787437439, "num_tokens": 1936425.0, "repeat_count": 0.0, "routers_loss": 0.0401819609105587, "skip_count": 2.0, "step": 1224, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.6662116040955635, "f1_execute": 0.9599999785423279, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 3.109375, "learning_rate": 0.0009829629131445341, "loss": 0.1095, "macro_f1": 0.8200000524520874, "num_tokens": 1939397.0, "repeat_count": 1.0, "routers_loss": 0.1785280406475067, "skip_count": 2.0, "step": 1226, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.6771331058020476, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.9296875, "learning_rate": 0.0009828693269695631, "loss": 0.0588, "macro_f1": 0.3272727429866791, "num_tokens": 1942988.0, "repeat_count": 0.0, "routers_loss": 0.02261308766901493, "skip_count": 0.0, "step": 1228, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.6880546075085325, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.078125, "learning_rate": 0.0009827754889394449, "loss": 0.0485, "macro_f1": 0.3272727429866791, "num_tokens": 1946582.0, "repeat_count": 0.0, "routers_loss": 0.012790258973836899, "skip_count": 0.0, "step": 1230, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 6.6989761092150175, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 2.84375, "learning_rate": 0.0009826813991031232, "loss": 0.0754, "macro_f1": 0.480392187833786, "num_tokens": 1949898.0, "repeat_count": 1.0, "routers_loss": 0.11266407370567322, "skip_count": 3.0, "step": 1232, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 6.709897610921502, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 2.296875, "learning_rate": 0.0009825870575096735, "loss": 0.1001, "macro_f1": 0.4871794879436493, "num_tokens": 1953055.0, "repeat_count": 0.0, "routers_loss": 0.060697004199028015, "skip_count": 2.0, "step": 1234, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.720819112627987, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.78125, "learning_rate": 0.0009824924642083026, "loss": 0.0933, "macro_f1": 0.31446540355682373, "num_tokens": 1956453.0, "repeat_count": 0.0, "routers_loss": 0.10489041358232498, "skip_count": 1.0, "step": 1236, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.731740614334471, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.546875, "learning_rate": 0.0009823976192483487, "loss": 0.1062, "macro_f1": 0.5492662787437439, "num_tokens": 1959389.0, "repeat_count": 0.0, "routers_loss": 0.024421939626336098, "skip_count": 2.0, "step": 1238, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 22.0, "epoch": 6.742662116040956, "f1_execute": 0.9090909361839294, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.734375, "learning_rate": 0.0009823025226792805, "loss": 0.0857, "macro_f1": 0.5252525806427002, "num_tokens": 1962998.0, "repeat_count": 0.0, "routers_loss": 0.08728008717298508, "skip_count": 6.0, "step": 1240, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.753583617747441, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.8984375, "learning_rate": 0.000982207174550699, "loss": 0.0895, "macro_f1": 0.3333333432674408, "num_tokens": 1965919.0, "repeat_count": 0.0, "routers_loss": 0.0014093272620812058, "skip_count": 0.0, "step": 1242, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.764505119453925, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.3125, "learning_rate": 0.0009821115749123355, "loss": 0.0922, "macro_f1": 0.3333333432674408, "num_tokens": 1969442.0, "repeat_count": 0.0, "routers_loss": 0.011364223435521126, "skip_count": 0.0, "step": 1244, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.77542662116041, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.703125, "learning_rate": 0.0009820157238140535, "loss": 0.1043, "macro_f1": 0.3333333432674408, "num_tokens": 1971840.0, "repeat_count": 0.0, "routers_loss": 0.011595879681408405, "skip_count": 0.0, "step": 1246, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 6.786348122866894, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 4.53125, "learning_rate": 0.0009819196213058463, "loss": 0.0974, "macro_f1": 0.6601307392120361, "num_tokens": 1975352.0, "repeat_count": 0.0, "routers_loss": 0.01998947374522686, "skip_count": 2.0, "step": 1248, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.797269624573379, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.3125, "learning_rate": 0.0009818232674378398, "loss": 0.0741, "macro_f1": 0.3333333432674408, "num_tokens": 1978040.0, "repeat_count": 0.0, "routers_loss": 0.0056211939081549644, "skip_count": 0.0, "step": 1250, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 6.808191126279864, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.71875, "learning_rate": 0.0009817266622602898, "loss": 0.1004, "macro_f1": 0.5492662787437439, "num_tokens": 1980721.0, "repeat_count": 0.0, "routers_loss": 0.018358899280428886, "skip_count": 1.0, "step": 1252, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.819112627986348, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.5625, "learning_rate": 0.0009816298058235838, "loss": 0.1247, "macro_f1": 0.32098767161369324, "num_tokens": 1983304.0, "repeat_count": 1.0, "routers_loss": 0.07594861835241318, "skip_count": 0.0, "step": 1254, "text_loss": 0.0 }, { "acc_repeat": 0.75, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 6.830034129692833, "f1_execute": 0.9583333134651184, "f1_repeat": 0.8571428656578064, "f1_skip": 0.0, "grad_norm": 3.484375, "learning_rate": 0.0009815326981782403, "loss": 0.0733, "macro_f1": 0.6051587462425232, "num_tokens": 1986129.0, "repeat_count": 4.0, "routers_loss": 0.06383349001407623, "skip_count": 0.0, "step": 1256, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 6.840955631399318, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 6.71875, "learning_rate": 0.0009814353393749085, "loss": 0.0936, "macro_f1": 0.6603773832321167, "num_tokens": 1989969.0, "repeat_count": 0.0, "routers_loss": 0.07547000050544739, "skip_count": 1.0, "step": 1258, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.851877133105802, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.53125, "learning_rate": 0.000981337729464369, "loss": 0.1036, "macro_f1": 0.3333333432674408, "num_tokens": 1993593.0, "repeat_count": 0.0, "routers_loss": 0.0030110280495136976, "skip_count": 0.0, "step": 1260, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.862798634812287, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.28125, "learning_rate": 0.0009812398684975333, "loss": 0.0928, "macro_f1": 0.3272727429866791, "num_tokens": 1996793.0, "repeat_count": 0.0, "routers_loss": 0.01446655672043562, "skip_count": 1.0, "step": 1262, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.873720136518771, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.140625, "learning_rate": 0.0009811417565254437, "loss": 0.0767, "macro_f1": 0.3333333432674408, "num_tokens": 1999674.0, "repeat_count": 0.0, "routers_loss": 0.0017486799042671919, "skip_count": 0.0, "step": 1264, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.884641638225256, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.359375, "learning_rate": 0.0009810433935992732, "loss": 0.0816, "macro_f1": 0.3272727429866791, "num_tokens": 2002868.0, "repeat_count": 0.0, "routers_loss": 0.11616934835910797, "skip_count": 1.0, "step": 1266, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 6.895563139931741, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 2.828125, "learning_rate": 0.0009809447797703263, "loss": 0.0517, "macro_f1": 0.4871794879436493, "num_tokens": 2005920.0, "repeat_count": 0.0, "routers_loss": 0.02944372035562992, "skip_count": 2.0, "step": 1268, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.906484641638225, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.8984375, "learning_rate": 0.0009808459150900382, "loss": 0.0571, "macro_f1": 0.3272727429866791, "num_tokens": 2008784.0, "repeat_count": 0.0, "routers_loss": 0.013198727741837502, "skip_count": 0.0, "step": 1270, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 6.91740614334471, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.078125, "learning_rate": 0.0009807467996099742, "loss": 0.0749, "macro_f1": 0.3272727429866791, "num_tokens": 2011565.0, "repeat_count": 0.0, "routers_loss": 0.033459246158599854, "skip_count": 0.0, "step": 1272, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.6000000238418579, "avg_layers": 25.0, "epoch": 6.928327645051194, "f1_execute": 0.878048837184906, "f1_repeat": 0.800000011920929, "f1_skip": 0.6000000238418579, "grad_norm": 4.96875, "learning_rate": 0.0009806474333818316, "loss": 0.1451, "macro_f1": 0.7593497037887573, "num_tokens": 2014588.0, "repeat_count": 3.0, "routers_loss": 0.5193678140640259, "skip_count": 5.0, "step": 1274, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 6.939249146757679, "f1_execute": 0.8979591727256775, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 7.3125, "learning_rate": 0.0009805478164574373, "loss": 0.1787, "macro_f1": 0.4326530694961548, "num_tokens": 2017582.0, "repeat_count": 0.0, "routers_loss": 0.10298392176628113, "skip_count": 2.0, "step": 1276, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 6.950170648464164, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.359375, "learning_rate": 0.0009804479488887499, "loss": 0.0606, "macro_f1": 0.3076923191547394, "num_tokens": 2021554.0, "repeat_count": 0.0, "routers_loss": 0.26711538434028625, "skip_count": 2.0, "step": 1278, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 29.0, "epoch": 6.961092150170648, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.359375, "learning_rate": 0.0009803478307278582, "loss": 0.0656, "macro_f1": 0.8820862174034119, "num_tokens": 2024619.0, "repeat_count": 2.0, "routers_loss": 0.10440108180046082, "skip_count": 2.0, "step": 1280, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 6.972013651877133, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 4.90625, "learning_rate": 0.000980247462026982, "loss": 0.1014, "macro_f1": 0.6603773832321167, "num_tokens": 2027642.0, "repeat_count": 1.0, "routers_loss": 0.04668421298265457, "skip_count": 1.0, "step": 1282, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 6.982935153583618, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.0625, "learning_rate": 0.0009801468428384716, "loss": 0.0877, "macro_f1": 0.32098764181137085, "num_tokens": 2030696.0, "repeat_count": 0.0, "routers_loss": 0.04770398512482643, "skip_count": 0.0, "step": 1284, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 6.993856655290102, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.8203125, "learning_rate": 0.000980045973214808, "loss": 0.061, "macro_f1": 0.3272727429866791, "num_tokens": 2034805.0, "repeat_count": 1.0, "routers_loss": 0.1269090622663498, "skip_count": 0.0, "step": 1286, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 7.0, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.34375, "learning_rate": 0.0009799448532086027, "loss": 0.0504, "macro_f1": 0.6538461446762085, "num_tokens": 2036412.0, "repeat_count": 1.0, "routers_loss": 0.05963828042149544, "skip_count": 1.0, "step": 1288, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 7.010921501706485, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 6.40625, "learning_rate": 0.0009798434828725978, "loss": 0.0753, "macro_f1": 0.3272727429866791, "num_tokens": 2039824.0, "repeat_count": 0.0, "routers_loss": 0.027600891888141632, "skip_count": 0.0, "step": 1290, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.021843003412969, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 3.015625, "learning_rate": 0.0009797418622596663, "loss": 0.0801, "macro_f1": 0.6666666865348816, "num_tokens": 2042860.0, "repeat_count": 0.0, "routers_loss": 0.00797218643128872, "skip_count": 1.0, "step": 1292, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.032764505119454, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.7578125, "learning_rate": 0.0009796399914228115, "loss": 0.0504, "macro_f1": 0.5492662787437439, "num_tokens": 2046324.0, "repeat_count": 0.0, "routers_loss": 0.05377062410116196, "skip_count": 2.0, "step": 1294, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.043686006825938, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.6875, "learning_rate": 0.0009795378704151674, "loss": 0.0383, "macro_f1": 0.3333333432674408, "num_tokens": 2049367.0, "repeat_count": 0.0, "routers_loss": 0.0009546718210913241, "skip_count": 0.0, "step": 1296, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 7.054607508532423, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 6.21875, "learning_rate": 0.0009794354992899979, "loss": 0.0667, "macro_f1": 0.4533333480358124, "num_tokens": 2052338.0, "repeat_count": 0.0, "routers_loss": 0.08195896446704865, "skip_count": 2.0, "step": 1298, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.065529010238908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.78125, "learning_rate": 0.0009793328781006981, "loss": 0.0847, "macro_f1": 0.3333333432674408, "num_tokens": 2055012.0, "repeat_count": 0.0, "routers_loss": 0.016476945951581, "skip_count": 0.0, "step": 1300, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.076450511945392, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.40625, "learning_rate": 0.0009792300069007929, "loss": 0.0585, "macro_f1": 0.32098764181137085, "num_tokens": 2057860.0, "repeat_count": 0.0, "routers_loss": 0.03864477947354317, "skip_count": 0.0, "step": 1302, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 7.087372013651877, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 2.421875, "learning_rate": 0.0009791268857439381, "loss": 0.0681, "macro_f1": 0.4871794879436493, "num_tokens": 2061796.0, "repeat_count": 0.0, "routers_loss": 0.09663038700819016, "skip_count": 2.0, "step": 1304, "text_loss": 0.0 }, { "acc_repeat": 0.3333333432674408, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 7.098293515358362, "f1_execute": 0.9130434393882751, "f1_repeat": 0.5, "f1_skip": 0.6666666865348816, "grad_norm": 1.4609375, "learning_rate": 0.0009790235146839197, "loss": 0.0277, "macro_f1": 0.693236768245697, "num_tokens": 2065048.0, "repeat_count": 3.0, "routers_loss": 0.1414264291524887, "skip_count": 3.0, "step": 1306, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.109215017064846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1953125, "learning_rate": 0.0009789198937746536, "loss": 0.0343, "macro_f1": 0.3333333432674408, "num_tokens": 2068513.0, "repeat_count": 0.0, "routers_loss": 0.005623117554932833, "skip_count": 0.0, "step": 1308, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.120136518771331, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.890625, "learning_rate": 0.0009788160230701872, "loss": 0.05, "macro_f1": 0.3333333432674408, "num_tokens": 2072162.0, "repeat_count": 0.0, "routers_loss": 0.0010799479205161333, "skip_count": 0.0, "step": 1310, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.131058020477815, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.015625, "learning_rate": 0.000978711902624697, "loss": 0.0477, "macro_f1": 0.3272727429866791, "num_tokens": 2075493.0, "repeat_count": 0.0, "routers_loss": 0.013055410236120224, "skip_count": 1.0, "step": 1312, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 7.1419795221843, "f1_execute": 0.978723406791687, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.015625, "learning_rate": 0.0009786075324924898, "loss": 0.0918, "macro_f1": 0.6595745086669922, "num_tokens": 2078419.0, "repeat_count": 1.0, "routers_loss": 0.04582887142896652, "skip_count": 4.0, "step": 1314, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 7.152901023890785, "f1_execute": 0.9166666269302368, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 5.3125, "learning_rate": 0.0009785029127280037, "loss": 0.1153, "macro_f1": 0.5722222328186035, "num_tokens": 2081402.0, "repeat_count": 2.0, "routers_loss": 0.4450821280479431, "skip_count": 3.0, "step": 1316, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.163822525597269, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.875, "learning_rate": 0.0009783980433858057, "loss": 0.1022, "macro_f1": 0.3333333432674408, "num_tokens": 2084281.0, "repeat_count": 0.0, "routers_loss": 0.0024625554215162992, "skip_count": 0.0, "step": 1318, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.174744027303754, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.171875, "learning_rate": 0.000978292924520594, "loss": 0.0451, "macro_f1": 0.3333333432674408, "num_tokens": 2086989.0, "repeat_count": 0.0, "routers_loss": 0.0038409712724387646, "skip_count": 0.0, "step": 1320, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.1856655290102385, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.21875, "learning_rate": 0.0009781875561871965, "loss": 0.0307, "macro_f1": 0.5492662787437439, "num_tokens": 2090237.0, "repeat_count": 0.0, "routers_loss": 0.16634200513362885, "skip_count": 2.0, "step": 1322, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.1965870307167235, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.125, "learning_rate": 0.0009780819384405711, "loss": 0.0775, "macro_f1": 0.5492662787437439, "num_tokens": 2093942.0, "repeat_count": 0.0, "routers_loss": 0.21338669955730438, "skip_count": 2.0, "step": 1324, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.207508532423208, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.40625, "learning_rate": 0.000977976071335806, "loss": 0.0528, "macro_f1": 0.3333333432674408, "num_tokens": 2097090.0, "repeat_count": 0.0, "routers_loss": 0.0032144824508577585, "skip_count": 0.0, "step": 1326, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.2184300341296925, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.84375, "learning_rate": 0.0009778699549281191, "loss": 0.0754, "macro_f1": 0.3333333432674408, "num_tokens": 2100108.0, "repeat_count": 0.0, "routers_loss": 0.00314159388653934, "skip_count": 0.0, "step": 1328, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.2293515358361775, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 6.8125, "learning_rate": 0.0009777635892728592, "loss": 0.1257, "macro_f1": 0.5492662787437439, "num_tokens": 2103214.0, "repeat_count": 0.0, "routers_loss": 0.04726361483335495, "skip_count": 2.0, "step": 1330, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.2402730375426625, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 8.375, "learning_rate": 0.0009776569744255042, "loss": 0.1294, "macro_f1": 0.5492662787437439, "num_tokens": 2106042.0, "repeat_count": 0.0, "routers_loss": 0.020205877721309662, "skip_count": 1.0, "step": 1332, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 7.251194539249147, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.5, "learning_rate": 0.0009775501104416623, "loss": 0.0712, "macro_f1": 0.31446540355682373, "num_tokens": 2108888.0, "repeat_count": 0.0, "routers_loss": 0.05470820143818855, "skip_count": 1.0, "step": 1334, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 7.262116040955632, "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, "grad_norm": 3.515625, "learning_rate": 0.000977442997377072, "loss": 0.0443, "macro_f1": 0.7795917987823486, "num_tokens": 2111969.0, "repeat_count": 1.0, "routers_loss": 0.13747620582580566, "skip_count": 2.0, "step": 1336, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 7.273037542662116, "f1_execute": 0.95652174949646, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.078125, "learning_rate": 0.000977335635287601, "loss": 0.0727, "macro_f1": 0.8743962049484253, "num_tokens": 2114882.0, "repeat_count": 2.0, "routers_loss": 0.01738063246011734, "skip_count": 3.0, "step": 1338, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.283959044368601, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0859375, "learning_rate": 0.000977228024229247, "loss": 0.0388, "macro_f1": 0.3333333432674408, "num_tokens": 2117566.0, "repeat_count": 0.0, "routers_loss": 0.005629166029393673, "skip_count": 0.0, "step": 1340, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.294880546075086, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.40625, "learning_rate": 0.0009771201642581385, "loss": 0.0392, "macro_f1": 0.3333333432674408, "num_tokens": 2121698.0, "repeat_count": 0.0, "routers_loss": 0.00142308056820184, "skip_count": 0.0, "step": 1342, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6000000238418579, "avg_layers": 28.0, "epoch": 7.30580204778157, "f1_execute": 0.9523809552192688, "f1_repeat": 1.0, "f1_skip": 0.75, "grad_norm": 2.265625, "learning_rate": 0.0009770120554305325, "loss": 0.0432, "macro_f1": 0.9007936716079712, "num_tokens": 2124686.0, "repeat_count": 3.0, "routers_loss": 0.35140305757522583, "skip_count": 5.0, "step": 1344, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.316723549488055, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.203125, "learning_rate": 0.0009769036978028172, "loss": 0.081, "macro_f1": 0.5492662787437439, "num_tokens": 2129166.0, "repeat_count": 0.0, "routers_loss": 0.028780370950698853, "skip_count": 1.0, "step": 1346, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 7.327645051194539, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.28125, "learning_rate": 0.000976795091431509, "loss": 0.0596, "macro_f1": 0.32098767161369324, "num_tokens": 2132432.0, "repeat_count": 0.0, "routers_loss": 0.03611215204000473, "skip_count": 1.0, "step": 1348, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.338566552901024, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.765625, "learning_rate": 0.0009766862363732552, "loss": 0.0832, "macro_f1": 0.6666666865348816, "num_tokens": 2135639.0, "repeat_count": 0.0, "routers_loss": 0.005299614276736975, "skip_count": 2.0, "step": 1350, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.349488054607509, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.640625, "learning_rate": 0.0009765771326848325, "loss": 0.1078, "macro_f1": 0.3333333432674408, "num_tokens": 2137976.0, "repeat_count": 0.0, "routers_loss": 0.0004135340277571231, "skip_count": 0.0, "step": 1352, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.360409556313993, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.078125, "learning_rate": 0.0009764677804231472, "loss": 0.0644, "macro_f1": 0.3333333432674408, "num_tokens": 2140628.0, "repeat_count": 0.0, "routers_loss": 0.009225958958268166, "skip_count": 0.0, "step": 1354, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 29.0, "epoch": 7.371331058020478, "f1_execute": 0.9387754797935486, "f1_repeat": 0.6666666865348816, "f1_skip": 0.5, "grad_norm": 3.59375, "learning_rate": 0.0009763581796452353, "loss": 0.0541, "macro_f1": 0.7018141150474548, "num_tokens": 2143844.0, "repeat_count": 1.0, "routers_loss": 0.11593814194202423, "skip_count": 3.0, "step": 1356, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.382252559726963, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.703125, "learning_rate": 0.0009762483304082624, "loss": 0.0621, "macro_f1": 0.32098764181137085, "num_tokens": 2147026.0, "repeat_count": 0.0, "routers_loss": 0.09083937108516693, "skip_count": 2.0, "step": 1358, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.393174061433447, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.71875, "learning_rate": 0.0009761382327695236, "loss": 0.0612, "macro_f1": 0.3272727429866791, "num_tokens": 2149756.0, "repeat_count": 1.0, "routers_loss": 0.08116576820611954, "skip_count": 0.0, "step": 1360, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 7.404095563139932, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.828125, "learning_rate": 0.0009760278867864438, "loss": 0.0386, "macro_f1": 0.3272727429866791, "num_tokens": 2152967.0, "repeat_count": 0.0, "routers_loss": 0.011396174319088459, "skip_count": 0.0, "step": 1362, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.415017064846416, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.03125, "learning_rate": 0.0009759172925165773, "loss": 0.0681, "macro_f1": 0.3333333432674408, "num_tokens": 2155836.0, "repeat_count": 0.0, "routers_loss": 0.0005431486060842872, "skip_count": 0.0, "step": 1364, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.425938566552901, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.328125, "learning_rate": 0.0009758064500176078, "loss": 0.0808, "macro_f1": 0.3333333432674408, "num_tokens": 2158771.0, "repeat_count": 0.0, "routers_loss": 0.0003947779187001288, "skip_count": 0.0, "step": 1366, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.436860068259386, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5078125, "learning_rate": 0.0009756953593473487, "loss": 0.063, "macro_f1": 0.3272727429866791, "num_tokens": 2162386.0, "repeat_count": 1.0, "routers_loss": 0.07745275646448135, "skip_count": 0.0, "step": 1368, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 26.0, "epoch": 7.44778156996587, "f1_execute": 0.9200000166893005, "f1_repeat": 0.0, "f1_skip": 0.4000000059604645, "grad_norm": 3.5, "learning_rate": 0.0009755840205637425, "loss": 0.08, "macro_f1": 0.4400000274181366, "num_tokens": 2165307.0, "repeat_count": 1.0, "routers_loss": 0.11292161792516708, "skip_count": 3.0, "step": 1370, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.458703071672355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 4.59375, "learning_rate": 0.0009754724337248615, "loss": 0.0505, "macro_f1": 0.6666666865348816, "num_tokens": 2168182.0, "repeat_count": 0.0, "routers_loss": 0.003505303058773279, "skip_count": 2.0, "step": 1372, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.46962457337884, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.3203125, "learning_rate": 0.0009753605988889072, "loss": 0.0351, "macro_f1": 0.6666666865348816, "num_tokens": 2171187.0, "repeat_count": 0.0, "routers_loss": 0.0003129165852442384, "skip_count": 1.0, "step": 1374, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.480546075085324, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.25, "learning_rate": 0.0009752485161142102, "loss": 0.0629, "macro_f1": 0.5492662787437439, "num_tokens": 2174496.0, "repeat_count": 0.0, "routers_loss": 0.020839953795075417, "skip_count": 2.0, "step": 1376, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.491467576791809, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5859375, "learning_rate": 0.0009751361854592311, "loss": 0.0603, "macro_f1": 0.3333333432674408, "num_tokens": 2177601.0, "repeat_count": 0.0, "routers_loss": 0.0020428141579031944, "skip_count": 0.0, "step": 1378, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.502389078498293, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.203125, "learning_rate": 0.0009750236069825592, "loss": 0.0596, "macro_f1": 0.5492662787437439, "num_tokens": 2181421.0, "repeat_count": 0.0, "routers_loss": 0.14440438151359558, "skip_count": 2.0, "step": 1380, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.513310580204778, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.578125, "learning_rate": 0.000974910780742913, "loss": 0.0468, "macro_f1": 0.5492662787437439, "num_tokens": 2184378.0, "repeat_count": 0.0, "routers_loss": 0.03232918307185173, "skip_count": 2.0, "step": 1382, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.524232081911263, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.1875, "learning_rate": 0.0009747977067991405, "loss": 0.0391, "macro_f1": 0.3333333432674408, "num_tokens": 2187975.0, "repeat_count": 0.0, "routers_loss": 0.0005432961042970419, "skip_count": 0.0, "step": 1384, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.535153583617747, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0625, "learning_rate": 0.0009746843852102189, "loss": 0.0524, "macro_f1": 0.3333333432674408, "num_tokens": 2191848.0, "repeat_count": 0.0, "routers_loss": 0.0045736003667116165, "skip_count": 0.0, "step": 1386, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 7.546075085324232, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 3.84375, "learning_rate": 0.0009745708160352549, "loss": 0.0608, "macro_f1": 1.0, "num_tokens": 2194502.0, "repeat_count": 1.0, "routers_loss": 0.002219662768766284, "skip_count": 1.0, "step": 1388, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.556996587030717, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.953125, "learning_rate": 0.0009744569993334833, "loss": 0.0447, "macro_f1": 0.3333333432674408, "num_tokens": 2197666.0, "repeat_count": 0.0, "routers_loss": 0.0019073592266067863, "skip_count": 0.0, "step": 1390, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 7.567918088737201, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5703125, "learning_rate": 0.0009743429351642692, "loss": 0.042, "macro_f1": 0.3272727429866791, "num_tokens": 2200395.0, "repeat_count": 0.0, "routers_loss": 0.028218531981110573, "skip_count": 0.0, "step": 1392, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.578839590443686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.21875, "learning_rate": 0.0009742286235871058, "loss": 0.0486, "macro_f1": 0.3333333432674408, "num_tokens": 2203180.0, "repeat_count": 0.0, "routers_loss": 0.0035933530889451504, "skip_count": 0.0, "step": 1394, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 7.58976109215017, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 3.03125, "learning_rate": 0.000974114064661616, "loss": 0.0746, "macro_f1": 0.9265305995941162, "num_tokens": 2206602.0, "repeat_count": 1.0, "routers_loss": 0.037048108875751495, "skip_count": 3.0, "step": 1396, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 7.600682593856655, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.5625, "learning_rate": 0.0009739992584475515, "loss": 0.0497, "macro_f1": 0.3272727429866791, "num_tokens": 2208946.0, "repeat_count": 0.0, "routers_loss": 0.0377325601875782, "skip_count": 0.0, "step": 1398, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 7.611604095563139, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.53125, "learning_rate": 0.0009738842050047929, "loss": 0.0802, "macro_f1": 0.3272727429866791, "num_tokens": 2212094.0, "repeat_count": 0.0, "routers_loss": 0.04149351641535759, "skip_count": 0.0, "step": 1400, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.25, "avg_layers": 28.0, "epoch": 7.622525597269624, "f1_execute": 0.9387754797935486, "f1_repeat": 1.0, "f1_skip": 0.4000000059604645, "grad_norm": 2.484375, "learning_rate": 0.0009737689043933498, "loss": 0.1071, "macro_f1": 0.7795917987823486, "num_tokens": 2214792.0, "repeat_count": 1.0, "routers_loss": 0.06268326938152313, "skip_count": 4.0, "step": 1402, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 7.633447098976109, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 1.9921875, "learning_rate": 0.0009736533566733608, "loss": 0.0484, "macro_f1": 1.0, "num_tokens": 2217854.0, "repeat_count": 1.0, "routers_loss": 0.007443991489708424, "skip_count": 1.0, "step": 1404, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 25.0, "epoch": 7.6443686006825935, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.34375, "learning_rate": 0.0009735375619050932, "loss": 0.0983, "macro_f1": 0.3144654333591461, "num_tokens": 2221150.0, "repeat_count": 0.0, "routers_loss": 0.12694531679153442, "skip_count": 0.0, "step": 1406, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.6552901023890785, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.96875, "learning_rate": 0.0009734215201489434, "loss": 0.0831, "macro_f1": 0.3272727429866791, "num_tokens": 2224766.0, "repeat_count": 0.0, "routers_loss": 0.03844614699482918, "skip_count": 1.0, "step": 1408, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.6662116040955635, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.203125, "learning_rate": 0.0009733052314654364, "loss": 0.0948, "macro_f1": 0.5492662787437439, "num_tokens": 2228242.0, "repeat_count": 0.0, "routers_loss": 0.02449573017656803, "skip_count": 2.0, "step": 1410, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.6771331058020476, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.0, "learning_rate": 0.0009731886959152261, "loss": 0.0578, "macro_f1": 0.3333333432674408, "num_tokens": 2232724.0, "repeat_count": 0.0, "routers_loss": 0.0008567043696530163, "skip_count": 0.0, "step": 1412, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.6880546075085325, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.9375, "learning_rate": 0.0009730719135590953, "loss": 0.0312, "macro_f1": 0.3333333432674408, "num_tokens": 2235695.0, "repeat_count": 0.0, "routers_loss": 0.005676566623151302, "skip_count": 0.0, "step": 1414, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.6989761092150175, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.359375, "learning_rate": 0.0009729548844579552, "loss": 0.0627, "macro_f1": 0.3333333432674408, "num_tokens": 2239510.0, "repeat_count": 0.0, "routers_loss": 0.0011932249180972576, "skip_count": 0.0, "step": 1416, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.709897610921502, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.75, "learning_rate": 0.0009728376086728459, "loss": 0.0612, "macro_f1": 0.6666666865348816, "num_tokens": 2242774.0, "repeat_count": 0.0, "routers_loss": 0.005993308965116739, "skip_count": 2.0, "step": 1418, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 7.720819112627987, "f1_execute": 0.9411765336990356, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 2.5, "learning_rate": 0.0009727200862649363, "loss": 0.0689, "macro_f1": 0.480392187833786, "num_tokens": 2246460.0, "repeat_count": 1.0, "routers_loss": 0.11321140825748444, "skip_count": 3.0, "step": 1420, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 7.731740614334471, "f1_execute": 0.95652174949646, "f1_repeat": 0.6666666865348816, "f1_skip": 0.8571428656578064, "grad_norm": 1.9375, "learning_rate": 0.0009726023172955237, "loss": 0.0607, "macro_f1": 0.8267771601676941, "num_tokens": 2249423.0, "repeat_count": 2.0, "routers_loss": 0.11932716518640518, "skip_count": 3.0, "step": 1422, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 28.0, "epoch": 7.742662116040956, "f1_execute": 0.9411764740943909, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 4.71875, "learning_rate": 0.0009724843018260339, "loss": 0.1129, "macro_f1": 0.4803921580314636, "num_tokens": 2251892.0, "repeat_count": 0.0, "routers_loss": 0.1626795083284378, "skip_count": 3.0, "step": 1424, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.753583617747441, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.7578125, "learning_rate": 0.0009723660399180216, "loss": 0.0482, "macro_f1": 0.3333333432674408, "num_tokens": 2254640.0, "repeat_count": 0.0, "routers_loss": 0.0006556808366440237, "skip_count": 0.0, "step": 1426, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.764505119453925, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.21875, "learning_rate": 0.0009722475316331701, "loss": 0.0554, "macro_f1": 0.5492662787437439, "num_tokens": 2257476.0, "repeat_count": 0.0, "routers_loss": 0.01195655670017004, "skip_count": 1.0, "step": 1428, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.77542662116041, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 3.515625, "learning_rate": 0.0009721287770332905, "loss": 0.0976, "macro_f1": 0.6666666865348816, "num_tokens": 2261358.0, "repeat_count": 0.0, "routers_loss": 0.005643711891025305, "skip_count": 2.0, "step": 1430, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.786348122866894, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.359375, "learning_rate": 0.0009720097761803233, "loss": 0.0474, "macro_f1": 0.6666666865348816, "num_tokens": 2264791.0, "repeat_count": 0.0, "routers_loss": 0.013344124890863895, "skip_count": 2.0, "step": 1432, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.797269624573379, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.203125, "learning_rate": 0.000971890529136337, "loss": 0.0593, "macro_f1": 0.5492662787437439, "num_tokens": 2267454.0, "repeat_count": 0.0, "routers_loss": 0.0102626858279109, "skip_count": 2.0, "step": 1434, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 7.808191126279864, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.734375, "learning_rate": 0.0009717710359635281, "loss": 0.0565, "macro_f1": 0.3272727429866791, "num_tokens": 2270797.0, "repeat_count": 0.0, "routers_loss": 0.009162090718746185, "skip_count": 0.0, "step": 1436, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 7.819112627986348, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.671875, "learning_rate": 0.0009716512967242221, "loss": 0.0631, "macro_f1": 0.3272727429866791, "num_tokens": 2274181.0, "repeat_count": 0.0, "routers_loss": 0.013584171421825886, "skip_count": 0.0, "step": 1438, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.830034129692833, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.75, "learning_rate": 0.0009715313114808729, "loss": 0.092, "macro_f1": 0.3333333432674408, "num_tokens": 2277003.0, "repeat_count": 0.0, "routers_loss": 0.0006492941174656153, "skip_count": 0.0, "step": 1440, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 7.840955631399318, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1015625, "learning_rate": 0.000971411080296062, "loss": 0.041, "macro_f1": 0.3272727429866791, "num_tokens": 2280295.0, "repeat_count": 0.0, "routers_loss": 0.02994344010949135, "skip_count": 0.0, "step": 1442, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.851877133105802, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.53125, "learning_rate": 0.0009712906032325001, "loss": 0.0577, "macro_f1": 0.3333333432674408, "num_tokens": 2282932.0, "repeat_count": 0.0, "routers_loss": 0.0002139290008926764, "skip_count": 0.0, "step": 1444, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 7.862798634812287, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.5, "learning_rate": 0.0009711698803530253, "loss": 0.0677, "macro_f1": 0.32098767161369324, "num_tokens": 2286501.0, "repeat_count": 0.0, "routers_loss": 0.049987293779850006, "skip_count": 1.0, "step": 1446, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.873720136518771, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 3.453125, "learning_rate": 0.0009710489117206042, "loss": 0.078, "macro_f1": 0.6603773832321167, "num_tokens": 2289485.0, "repeat_count": 1.0, "routers_loss": 0.01052978727966547, "skip_count": 0.0, "step": 1448, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 7.884641638225256, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 2.875, "learning_rate": 0.0009709276973983322, "loss": 0.0702, "macro_f1": 1.0, "num_tokens": 2293253.0, "repeat_count": 1.0, "routers_loss": 0.0023461326491087675, "skip_count": 1.0, "step": 1450, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 7.895563139931741, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.9765625, "learning_rate": 0.0009708062374494317, "loss": 0.0588, "macro_f1": 0.6666666865348816, "num_tokens": 2296570.0, "repeat_count": 0.0, "routers_loss": 0.0006967078079469502, "skip_count": 1.0, "step": 1452, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.906484641638225, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.59375, "learning_rate": 0.0009706845319372542, "loss": 0.0552, "macro_f1": 0.5492662787437439, "num_tokens": 2300945.0, "repeat_count": 0.0, "routers_loss": 0.022759471088647842, "skip_count": 2.0, "step": 1454, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 7.91740614334471, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.3984375, "learning_rate": 0.0009705625809252788, "loss": 0.0413, "macro_f1": 0.5492662787437439, "num_tokens": 2304058.0, "repeat_count": 0.0, "routers_loss": 0.011610173620283604, "skip_count": 1.0, "step": 1456, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.928327645051194, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4296875, "learning_rate": 0.0009704403844771128, "loss": 0.0557, "macro_f1": 0.3272727429866791, "num_tokens": 2307217.0, "repeat_count": 0.0, "routers_loss": 0.04158193618059158, "skip_count": 1.0, "step": 1458, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.939249146757679, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.484375, "learning_rate": 0.0009703179426564912, "loss": 0.0489, "macro_f1": 0.3333333432674408, "num_tokens": 2309861.0, "repeat_count": 0.0, "routers_loss": 0.0030414776410907507, "skip_count": 0.0, "step": 1460, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 7.950170648464164, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.8125, "learning_rate": 0.0009701952555272776, "loss": 0.0631, "macro_f1": 0.3272727429866791, "num_tokens": 2312887.0, "repeat_count": 0.0, "routers_loss": 0.025799794122576714, "skip_count": 0.0, "step": 1462, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 7.961092150170648, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 6.375, "learning_rate": 0.0009700723231534631, "loss": 0.0971, "macro_f1": 0.5427350401878357, "num_tokens": 2316206.0, "repeat_count": 1.0, "routers_loss": 0.09529697149991989, "skip_count": 2.0, "step": 1464, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 7.972013651877133, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.640625, "learning_rate": 0.0009699491455991666, "loss": 0.0758, "macro_f1": 0.6666666865348816, "num_tokens": 2318979.0, "repeat_count": 0.0, "routers_loss": 0.00970232393592596, "skip_count": 4.0, "step": 1466, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.982935153583618, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.171875, "learning_rate": 0.0009698257229286354, "loss": 0.0776, "macro_f1": 0.3272727429866791, "num_tokens": 2322057.0, "repeat_count": 0.0, "routers_loss": 0.04689466580748558, "skip_count": 1.0, "step": 1468, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 7.993856655290102, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.890625, "learning_rate": 0.0009697020552062439, "loss": 0.0443, "macro_f1": 0.3333333432674408, "num_tokens": 2325676.0, "repeat_count": 0.0, "routers_loss": 0.009762553498148918, "skip_count": 0.0, "step": 1470, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.0, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.515625, "learning_rate": 0.0009695781424964953, "loss": 0.0479, "macro_f1": 0.6666666865348816, "num_tokens": 2327328.0, "repeat_count": 0.0, "routers_loss": 0.009211843833327293, "skip_count": 1.0, "step": 1472, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.010921501706484, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 7.875, "learning_rate": 0.0009694539848640196, "loss": 0.1048, "macro_f1": 0.3333333432674408, "num_tokens": 2330236.0, "repeat_count": 0.0, "routers_loss": 0.010994589887559414, "skip_count": 0.0, "step": 1474, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.02184300341297, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.09375, "learning_rate": 0.0009693295823735753, "loss": 0.1089, "macro_f1": 0.3272727429866791, "num_tokens": 2333367.0, "repeat_count": 0.0, "routers_loss": 0.0315704308450222, "skip_count": 1.0, "step": 1476, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.032764505119454, "f1_execute": 0.943396270275116, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.375, "learning_rate": 0.000969204935090048, "loss": 0.0488, "macro_f1": 0.3144654333591461, "num_tokens": 2336208.0, "repeat_count": 1.0, "routers_loss": 0.24644848704338074, "skip_count": 2.0, "step": 1478, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.043686006825938, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.96875, "learning_rate": 0.0009690800430784516, "loss": 0.0501, "macro_f1": 0.6666666865348816, "num_tokens": 2340180.0, "repeat_count": 0.0, "routers_loss": 0.0002616599085740745, "skip_count": 1.0, "step": 1480, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 8.054607508532424, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.25, "learning_rate": 0.0009689549064039269, "loss": 0.0754, "macro_f1": 0.5492662787437439, "num_tokens": 2344432.0, "repeat_count": 0.0, "routers_loss": 0.064383864402771, "skip_count": 2.0, "step": 1482, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 28.0, "epoch": 8.065529010238908, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 7.09375, "learning_rate": 0.0009688295251317431, "loss": 0.1015, "macro_f1": 0.9262410998344421, "num_tokens": 2349493.0, "repeat_count": 2.0, "routers_loss": 0.014145736582577229, "skip_count": 3.0, "step": 1484, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 25.0, "epoch": 8.076450511945392, "f1_execute": 0.9599999785423279, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.703125, "learning_rate": 0.0009687038993272965, "loss": 0.0739, "macro_f1": 0.542222261428833, "num_tokens": 2352950.0, "repeat_count": 0.0, "routers_loss": 0.03079567849636078, "skip_count": 3.0, "step": 1486, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 8.087372013651876, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.75, "learning_rate": 0.0009685780290561108, "loss": 0.0824, "macro_f1": 0.8823530077934265, "num_tokens": 2356201.0, "repeat_count": 1.0, "routers_loss": 0.01956949196755886, "skip_count": 2.0, "step": 1488, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 8.098293515358362, "f1_execute": 0.9615384340286255, "f1_repeat": 0.5, "f1_skip": 0.0, "grad_norm": 1.03125, "learning_rate": 0.0009684519143838379, "loss": 0.0197, "macro_f1": 0.4871794879436493, "num_tokens": 2359144.0, "repeat_count": 2.0, "routers_loss": 0.07746068388223648, "skip_count": 0.0, "step": 1490, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 8.109215017064846, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.3359375, "learning_rate": 0.0009683255553762564, "loss": 0.0665, "macro_f1": 0.5492662787437439, "num_tokens": 2362880.0, "repeat_count": 0.0, "routers_loss": 0.14250127971172333, "skip_count": 2.0, "step": 1492, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.12013651877133, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.4375, "learning_rate": 0.0009681989520992729, "loss": 0.0642, "macro_f1": 0.3333333432674408, "num_tokens": 2365271.0, "repeat_count": 0.0, "routers_loss": 0.004165742080658674, "skip_count": 0.0, "step": 1494, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 8.131058020477816, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.9296875, "learning_rate": 0.0009680721046189209, "loss": 0.0896, "macro_f1": 0.5492662787437439, "num_tokens": 2368339.0, "repeat_count": 0.0, "routers_loss": 0.14025560021400452, "skip_count": 2.0, "step": 1496, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 8.1419795221843, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 1.1875, "learning_rate": 0.0009679450130013618, "loss": 0.0277, "macro_f1": 0.6666666865348816, "num_tokens": 2371213.0, "repeat_count": 1.0, "routers_loss": 0.002212546532973647, "skip_count": 0.0, "step": 1498, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 8.152901023890784, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.109375, "learning_rate": 0.000967817677312884, "loss": 0.0515, "macro_f1": 0.5492662787437439, "num_tokens": 2373752.0, "repeat_count": 0.0, "routers_loss": 0.040635209530591965, "skip_count": 2.0, "step": 1500, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 8.16382252559727, "f1_execute": 0.9200000166893005, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 4.15625, "learning_rate": 0.0009676900976199033, "loss": 0.0697, "macro_f1": 0.64000004529953, "num_tokens": 2376748.0, "repeat_count": 1.0, "routers_loss": 0.16052690148353577, "skip_count": 2.0, "step": 1502, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.174744027303754, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1015625, "learning_rate": 0.0009675622739889626, "loss": 0.0261, "macro_f1": 0.3272727429866791, "num_tokens": 2380488.0, "repeat_count": 0.0, "routers_loss": 0.07952447980642319, "skip_count": 1.0, "step": 1504, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 8.185665529010238, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.328125, "learning_rate": 0.0009674342064867325, "loss": 0.0481, "macro_f1": 0.8823530077934265, "num_tokens": 2383596.0, "repeat_count": 1.0, "routers_loss": 0.03656064346432686, "skip_count": 2.0, "step": 1506, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.3333333432674408, "avg_layers": 27.0, "epoch": 8.196587030716724, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 1.09375, "learning_rate": 0.0009673058951800104, "loss": 0.05, "macro_f1": 0.4871794879436493, "num_tokens": 2387032.0, "repeat_count": 0.0, "routers_loss": 0.035941414535045624, "skip_count": 3.0, "step": 1508, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.207508532423208, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.97265625, "learning_rate": 0.0009671773401357209, "loss": 0.0622, "macro_f1": 0.6666666865348816, "num_tokens": 2390194.0, "repeat_count": 0.0, "routers_loss": 0.0015697282506152987, "skip_count": 1.0, "step": 1510, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.218430034129693, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.9296875, "learning_rate": 0.0009670485414209156, "loss": 0.0664, "macro_f1": 0.3333333432674408, "num_tokens": 2393400.0, "repeat_count": 0.0, "routers_loss": 0.0006098762387409806, "skip_count": 0.0, "step": 1512, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 8.229351535836177, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 2.75, "learning_rate": 0.0009669194991027737, "loss": 0.0427, "macro_f1": 1.0, "num_tokens": 2396539.0, "repeat_count": 2.0, "routers_loss": 0.0035735557321459055, "skip_count": 2.0, "step": 1514, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.240273037542662, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.46875, "learning_rate": 0.0009667902132486009, "loss": 0.0589, "macro_f1": 0.3333333432674408, "num_tokens": 2399108.0, "repeat_count": 0.0, "routers_loss": 0.0025496529415249825, "skip_count": 0.0, "step": 1516, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.251194539249147, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.390625, "learning_rate": 0.0009666606839258303, "loss": 0.054, "macro_f1": 0.3333333432674408, "num_tokens": 2401593.0, "repeat_count": 0.0, "routers_loss": 0.0028618371579796076, "skip_count": 0.0, "step": 1518, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.26211604095563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.25, "learning_rate": 0.0009665309112020218, "loss": 0.0809, "macro_f1": 0.6666666865348816, "num_tokens": 2404210.0, "repeat_count": 0.0, "routers_loss": 0.004981622099876404, "skip_count": 1.0, "step": 1520, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 8.273037542662117, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 1.796875, "learning_rate": 0.0009664008951448622, "loss": 0.0305, "macro_f1": 0.9452888369560242, "num_tokens": 2407696.0, "repeat_count": 1.0, "routers_loss": 0.019641229882836342, "skip_count": 3.0, "step": 1522, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 31.0, "epoch": 8.2839590443686, "f1_execute": 0.9803921580314636, "f1_repeat": 0.800000011920929, "f1_skip": 0.0, "grad_norm": 2.4375, "learning_rate": 0.0009662706358221656, "loss": 0.0831, "macro_f1": 0.5934640765190125, "num_tokens": 2410803.0, "repeat_count": 2.0, "routers_loss": 0.058166686445474625, "skip_count": 0.0, "step": 1524, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 8.294880546075085, "f1_execute": 0.923076868057251, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.34375, "learning_rate": 0.0009661401333018725, "loss": 0.121, "macro_f1": 0.307692289352417, "num_tokens": 2414707.0, "repeat_count": 1.0, "routers_loss": 0.24183401465415955, "skip_count": 2.0, "step": 1526, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 31.0, "epoch": 8.30580204778157, "f1_execute": 0.9599999785423279, "f1_repeat": 0.800000011920929, "f1_skip": 0.0, "grad_norm": 1.5625, "learning_rate": 0.0009660093876520503, "loss": 0.0164, "macro_f1": 0.5866667032241821, "num_tokens": 2418612.0, "repeat_count": 2.0, "routers_loss": 0.035785969346761703, "skip_count": 1.0, "step": 1528, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 8.316723549488055, "f1_execute": 0.9615384340286255, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 2.34375, "learning_rate": 0.0009658783989408935, "loss": 0.056, "macro_f1": 0.5427350401878357, "num_tokens": 2421451.0, "repeat_count": 1.0, "routers_loss": 0.11132128536701202, "skip_count": 0.0, "step": 1530, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 8.327645051194539, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.75, "learning_rate": 0.0009657471672367235, "loss": 0.0626, "macro_f1": 0.3272727429866791, "num_tokens": 2424406.0, "repeat_count": 0.0, "routers_loss": 0.009180179797112942, "skip_count": 0.0, "step": 1532, "text_loss": 0.0 }, { "acc_repeat": 0.6666666865348816, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 8.338566552901025, "f1_execute": 0.9333333373069763, "f1_repeat": 0.800000011920929, "f1_skip": 0.6666666865348816, "grad_norm": 1.78125, "learning_rate": 0.0009656156926079877, "loss": 0.0692, "macro_f1": 0.8000000715255737, "num_tokens": 2428674.0, "repeat_count": 3.0, "routers_loss": 0.9811075329780579, "skip_count": 4.0, "step": 1534, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.349488054607509, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.796875, "learning_rate": 0.0009654839751232612, "loss": 0.0327, "macro_f1": 0.3333333432674408, "num_tokens": 2432003.0, "repeat_count": 0.0, "routers_loss": 0.0006731169996783137, "skip_count": 0.0, "step": 1536, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 8.360409556313993, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.59375, "learning_rate": 0.0009653520148512449, "loss": 0.0449, "macro_f1": 0.3272727429866791, "num_tokens": 2435258.0, "repeat_count": 0.0, "routers_loss": 0.010055837221443653, "skip_count": 0.0, "step": 1538, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 8.371331058020477, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 1.28125, "learning_rate": 0.0009652198118607666, "loss": 0.0359, "macro_f1": 0.6666666865348816, "num_tokens": 2438333.0, "repeat_count": 2.0, "routers_loss": 0.010459319688379765, "skip_count": 0.0, "step": 1540, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.382252559726963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.359375, "learning_rate": 0.0009650873662207811, "loss": 0.0495, "macro_f1": 0.3333333432674408, "num_tokens": 2441112.0, "repeat_count": 0.0, "routers_loss": 0.0003271556051913649, "skip_count": 0.0, "step": 1542, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.393174061433447, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 1.8671875, "learning_rate": 0.000964954678000369, "loss": 0.0525, "macro_f1": 1.0, "num_tokens": 2444029.0, "repeat_count": 1.0, "routers_loss": 0.00024354885681532323, "skip_count": 2.0, "step": 1544, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 8.404095563139931, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.0625, "learning_rate": 0.0009648217472687384, "loss": 0.0433, "macro_f1": 0.3272727429866791, "num_tokens": 2446912.0, "repeat_count": 0.0, "routers_loss": 0.00775564182549715, "skip_count": 0.0, "step": 1546, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.415017064846417, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.234375, "learning_rate": 0.0009646885740952232, "loss": 0.0373, "macro_f1": 0.3333333432674408, "num_tokens": 2449809.0, "repeat_count": 0.0, "routers_loss": 0.0007458397885784507, "skip_count": 0.0, "step": 1548, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.425938566552901, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.9375, "learning_rate": 0.0009645551585492836, "loss": 0.0491, "macro_f1": 0.6666666865348816, "num_tokens": 2452291.0, "repeat_count": 0.0, "routers_loss": 0.005456595681607723, "skip_count": 2.0, "step": 1550, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 8.436860068259385, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.796875, "learning_rate": 0.0009644215007005068, "loss": 0.0393, "macro_f1": 0.5492662787437439, "num_tokens": 2455386.0, "repeat_count": 0.0, "routers_loss": 0.01841093599796295, "skip_count": 2.0, "step": 1552, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.447781569965871, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1796875, "learning_rate": 0.000964287600618606, "loss": 0.095, "macro_f1": 0.3333333432674408, "num_tokens": 2458903.0, "repeat_count": 0.0, "routers_loss": 0.002832385478541255, "skip_count": 0.0, "step": 1554, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 8.458703071672355, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 2.328125, "learning_rate": 0.0009641534583734211, "loss": 0.0341, "macro_f1": 1.0, "num_tokens": 2461663.0, "repeat_count": 1.0, "routers_loss": 0.0060501862317323685, "skip_count": 1.0, "step": 1556, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.46962457337884, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.5625, "learning_rate": 0.0009640190740349176, "loss": 0.0898, "macro_f1": 0.3333333432674408, "num_tokens": 2464545.0, "repeat_count": 0.0, "routers_loss": 0.00016725732712075114, "skip_count": 0.0, "step": 1558, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.480546075085325, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.984375, "learning_rate": 0.000963884447673188, "loss": 0.0515, "macro_f1": 0.3333333432674408, "num_tokens": 2467401.0, "repeat_count": 0.0, "routers_loss": 0.0013396950671449304, "skip_count": 0.0, "step": 1560, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 25.0, "epoch": 8.491467576791809, "f1_execute": 0.936170220375061, "f1_repeat": 1.0, "f1_skip": 0.5714285373687744, "grad_norm": 3.1875, "learning_rate": 0.0009637495793584509, "loss": 0.0472, "macro_f1": 0.8358663320541382, "num_tokens": 2470364.0, "repeat_count": 1.0, "routers_loss": 0.10716627538204193, "skip_count": 3.0, "step": 1562, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 8.502389078498293, "f1_execute": 0.9811320900917053, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 2.703125, "learning_rate": 0.0009636144691610507, "loss": 0.0606, "macro_f1": 0.5492662787437439, "num_tokens": 2473233.0, "repeat_count": 2.0, "routers_loss": 0.030880870297551155, "skip_count": 0.0, "step": 1564, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.513310580204777, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.546875, "learning_rate": 0.0009634791171514584, "loss": 0.0332, "macro_f1": 0.6666666865348816, "num_tokens": 2476693.0, "repeat_count": 0.0, "routers_loss": 0.005689056124538183, "skip_count": 2.0, "step": 1566, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.524232081911263, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.90625, "learning_rate": 0.0009633435234002709, "loss": 0.0326, "macro_f1": 0.3272727429866791, "num_tokens": 2480488.0, "repeat_count": 0.0, "routers_loss": 0.0174152459949255, "skip_count": 1.0, "step": 1568, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 8.535153583617747, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 2.40625, "learning_rate": 0.0009632076879782111, "loss": 0.0548, "macro_f1": 0.6122449040412903, "num_tokens": 2484468.0, "repeat_count": 0.0, "routers_loss": 0.009632926434278488, "skip_count": 3.0, "step": 1570, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.546075085324231, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.8828125, "learning_rate": 0.0009630716109561283, "loss": 0.0317, "macro_f1": 0.3333333432674408, "num_tokens": 2488054.0, "repeat_count": 0.0, "routers_loss": 0.0008580191060900688, "skip_count": 0.0, "step": 1572, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.556996587030717, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.953125, "learning_rate": 0.0009629352924049974, "loss": 0.049, "macro_f1": 0.3333333432674408, "num_tokens": 2491012.0, "repeat_count": 0.0, "routers_loss": 0.0030345257837325335, "skip_count": 0.0, "step": 1574, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.567918088737201, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.6328125, "learning_rate": 0.0009627987323959196, "loss": 0.0446, "macro_f1": 0.5492662787437439, "num_tokens": 2494812.0, "repeat_count": 0.0, "routers_loss": 0.01736641675233841, "skip_count": 1.0, "step": 1576, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 8.578839590443685, "f1_execute": 0.9795917868614197, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 3.3125, "learning_rate": 0.0009626619310001219, "loss": 0.0467, "macro_f1": 0.8820862174034119, "num_tokens": 2498193.0, "repeat_count": 1.0, "routers_loss": 0.03686746954917908, "skip_count": 2.0, "step": 1578, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.589761092150171, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.265625, "learning_rate": 0.0009625248882889572, "loss": 0.0369, "macro_f1": 0.3333333432674408, "num_tokens": 2500853.0, "repeat_count": 0.0, "routers_loss": 0.0002959762641694397, "skip_count": 0.0, "step": 1580, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.600682593856655, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.03125, "learning_rate": 0.0009623876043339044, "loss": 0.0536, "macro_f1": 0.3333333432674408, "num_tokens": 2504281.0, "repeat_count": 0.0, "routers_loss": 0.003022563410922885, "skip_count": 0.0, "step": 1582, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 8.61160409556314, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.375, "learning_rate": 0.0009622500792065678, "loss": 0.0214, "macro_f1": 0.5492662787437439, "num_tokens": 2507453.0, "repeat_count": 0.0, "routers_loss": 0.07192305475473404, "skip_count": 2.0, "step": 1584, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 8.622525597269625, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 2.921875, "learning_rate": 0.0009621123129786782, "loss": 0.0361, "macro_f1": 0.4871794879436493, "num_tokens": 2509961.0, "repeat_count": 0.0, "routers_loss": 0.2350051999092102, "skip_count": 1.0, "step": 1586, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.63344709897611, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.03125, "learning_rate": 0.0009619743057220913, "loss": 0.0582, "macro_f1": 0.3333333432674408, "num_tokens": 2513009.0, "repeat_count": 0.0, "routers_loss": 0.004353067837655544, "skip_count": 0.0, "step": 1588, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.644368600682593, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.8984375, "learning_rate": 0.0009618360575087895, "loss": 0.0447, "macro_f1": 0.6666666865348816, "num_tokens": 2515978.0, "repeat_count": 0.0, "routers_loss": 0.011450616642832756, "skip_count": 2.0, "step": 1590, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.655290102389078, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.8828125, "learning_rate": 0.00096169756841088, "loss": 0.0509, "macro_f1": 0.5492662787437439, "num_tokens": 2519143.0, "repeat_count": 0.0, "routers_loss": 0.17762798070907593, "skip_count": 1.0, "step": 1592, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 8.666211604095563, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.203125, "learning_rate": 0.000961558838500596, "loss": 0.0647, "macro_f1": 0.5427350401878357, "num_tokens": 2522328.0, "repeat_count": 1.0, "routers_loss": 0.08958186209201813, "skip_count": 2.0, "step": 1594, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 8.677133105802048, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.8125, "learning_rate": 0.0009614198678502964, "loss": 0.0524, "macro_f1": 0.32098764181137085, "num_tokens": 2525902.0, "repeat_count": 0.0, "routers_loss": 0.1928669810295105, "skip_count": 0.0, "step": 1596, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 8.688054607508532, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.859375, "learning_rate": 0.0009612806565324656, "loss": 0.0301, "macro_f1": 0.3272727429866791, "num_tokens": 2529173.0, "repeat_count": 0.0, "routers_loss": 0.015153140760958195, "skip_count": 0.0, "step": 1598, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.698976109215018, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.859375, "learning_rate": 0.0009611412046197133, "loss": 0.0687, "macro_f1": 0.3272727429866791, "num_tokens": 2532265.0, "repeat_count": 0.0, "routers_loss": 0.07715301960706711, "skip_count": 1.0, "step": 1600, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.709897610921502, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.859375, "learning_rate": 0.000961001512184775, "loss": 0.0534, "macro_f1": 0.3333333432674408, "num_tokens": 2535400.0, "repeat_count": 0.0, "routers_loss": 0.0017306022346019745, "skip_count": 0.0, "step": 1602, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.720819112627986, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.25, "learning_rate": 0.0009608615793005116, "loss": 0.034, "macro_f1": 0.3333333432674408, "num_tokens": 2538445.0, "repeat_count": 0.0, "routers_loss": 0.007778775412589312, "skip_count": 0.0, "step": 1604, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.731740614334472, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.734375, "learning_rate": 0.0009607214060399092, "loss": 0.0404, "macro_f1": 0.3333333432674408, "num_tokens": 2541317.0, "repeat_count": 0.0, "routers_loss": 0.0070070852525532246, "skip_count": 0.0, "step": 1606, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.742662116040956, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.0625, "learning_rate": 0.0009605809924760796, "loss": 0.034, "macro_f1": 0.32098764181137085, "num_tokens": 2544425.0, "repeat_count": 1.0, "routers_loss": 0.13846199214458466, "skip_count": 1.0, "step": 1608, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 8.75358361774744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.5625, "learning_rate": 0.0009604403386822594, "loss": 0.0578, "macro_f1": 0.6666666865348816, "num_tokens": 2547262.0, "repeat_count": 0.0, "routers_loss": 0.00024157771258614957, "skip_count": 1.0, "step": 1610, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 8.764505119453926, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 1.9140625, "learning_rate": 0.0009602994447318112, "loss": 0.0478, "macro_f1": 0.6666666865348816, "num_tokens": 2550692.0, "repeat_count": 2.0, "routers_loss": 0.0009380661649629474, "skip_count": 0.0, "step": 1612, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.77542662116041, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.75, "learning_rate": 0.0009601583106982224, "loss": 0.0474, "macro_f1": 0.6666666865348816, "num_tokens": 2553371.0, "repeat_count": 0.0, "routers_loss": 0.005738384556025267, "skip_count": 2.0, "step": 1614, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.786348122866894, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.2890625, "learning_rate": 0.0009600169366551057, "loss": 0.0267, "macro_f1": 0.6666666865348816, "num_tokens": 2555823.0, "repeat_count": 0.0, "routers_loss": 0.00928535871207714, "skip_count": 2.0, "step": 1616, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.797269624573378, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.09375, "learning_rate": 0.0009598753226761991, "loss": 0.0448, "macro_f1": 0.3333333432674408, "num_tokens": 2558936.0, "repeat_count": 0.0, "routers_loss": 0.008234796114265919, "skip_count": 0.0, "step": 1618, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 8.808191126279864, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 1.3984375, "learning_rate": 0.0009597334688353657, "loss": 0.0431, "macro_f1": 0.6666666865348816, "num_tokens": 2562350.0, "repeat_count": 1.0, "routers_loss": 0.01044188067317009, "skip_count": 0.0, "step": 1620, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.819112627986348, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.7265625, "learning_rate": 0.0009595913752065934, "loss": 0.0155, "macro_f1": 0.3333333432674408, "num_tokens": 2565345.0, "repeat_count": 0.0, "routers_loss": 0.006071321200579405, "skip_count": 0.0, "step": 1622, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 8.830034129692832, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 2.078125, "learning_rate": 0.0009594490418639957, "loss": 0.0474, "macro_f1": 0.6666666865348816, "num_tokens": 2568523.0, "repeat_count": 1.0, "routers_loss": 0.013989229686558247, "skip_count": 0.0, "step": 1624, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.840955631399318, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.25, "learning_rate": 0.0009593064688818109, "loss": 0.073, "macro_f1": 0.3333333432674408, "num_tokens": 2571689.0, "repeat_count": 0.0, "routers_loss": 0.0025419918820261955, "skip_count": 0.0, "step": 1626, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.851877133105802, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.046875, "learning_rate": 0.0009591636563344022, "loss": 0.0526, "macro_f1": 0.6666666865348816, "num_tokens": 2574711.0, "repeat_count": 0.0, "routers_loss": 0.009592074900865555, "skip_count": 2.0, "step": 1628, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.862798634812286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.69921875, "learning_rate": 0.0009590206042962577, "loss": 0.0513, "macro_f1": 0.6666666865348816, "num_tokens": 2577855.0, "repeat_count": 0.0, "routers_loss": 0.0005241495091468096, "skip_count": 2.0, "step": 1630, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.873720136518772, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.71875, "learning_rate": 0.0009588773128419905, "loss": 0.044, "macro_f1": 0.3333333432674408, "num_tokens": 2581205.0, "repeat_count": 0.0, "routers_loss": 0.003147244919091463, "skip_count": 0.0, "step": 1632, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.884641638225256, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.265625, "learning_rate": 0.000958733782046339, "loss": 0.0464, "macro_f1": 0.3333333432674408, "num_tokens": 2585585.0, "repeat_count": 0.0, "routers_loss": 0.003980289679020643, "skip_count": 0.0, "step": 1634, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 8.89556313993174, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 4.25, "learning_rate": 0.0009585900119841657, "loss": 0.0721, "macro_f1": 0.3272727429866791, "num_tokens": 2588512.0, "repeat_count": 0.0, "routers_loss": 0.059653643518686295, "skip_count": 0.0, "step": 1636, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.906484641638226, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0078125, "learning_rate": 0.0009584460027304583, "loss": 0.0416, "macro_f1": 0.31446540355682373, "num_tokens": 2591618.0, "repeat_count": 0.0, "routers_loss": 0.055689550936222076, "skip_count": 1.0, "step": 1638, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 8.91740614334471, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 1.484375, "learning_rate": 0.0009583017543603289, "loss": 0.0475, "macro_f1": 0.9265305995941162, "num_tokens": 2594416.0, "repeat_count": 1.0, "routers_loss": 0.09039502590894699, "skip_count": 3.0, "step": 1640, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 8.928327645051194, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.09375, "learning_rate": 0.0009581572669490153, "loss": 0.0524, "macro_f1": 0.5492662787437439, "num_tokens": 2598062.0, "repeat_count": 0.0, "routers_loss": 0.021063080057501793, "skip_count": 2.0, "step": 1642, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 8.93924914675768, "f1_execute": 0.9599999785423279, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 1.5390625, "learning_rate": 0.0009580125405718787, "loss": 0.045, "macro_f1": 0.5866667032241821, "num_tokens": 2601214.0, "repeat_count": 0.0, "routers_loss": 0.035861629992723465, "skip_count": 3.0, "step": 1644, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.950170648464164, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.953125, "learning_rate": 0.0009578675753044059, "loss": 0.0497, "macro_f1": 0.3333333432674408, "num_tokens": 2603749.0, "repeat_count": 0.0, "routers_loss": 0.0010205218568444252, "skip_count": 0.0, "step": 1646, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 8.961092150170648, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.4375, "learning_rate": 0.0009577223712222076, "loss": 0.0457, "macro_f1": 0.6666666865348816, "num_tokens": 2606922.0, "repeat_count": 0.0, "routers_loss": 0.002696859184652567, "skip_count": 2.0, "step": 1648, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 8.972013651877132, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.34375, "learning_rate": 0.0009575769284010198, "loss": 0.0638, "macro_f1": 0.3272727429866791, "num_tokens": 2610140.0, "repeat_count": 0.0, "routers_loss": 0.011554194614291191, "skip_count": 0.0, "step": 1650, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.982935153583618, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4453125, "learning_rate": 0.0009574312469167023, "loss": 0.0307, "macro_f1": 0.3333333432674408, "num_tokens": 2613511.0, "repeat_count": 0.0, "routers_loss": 0.0027109351940453053, "skip_count": 0.0, "step": 1652, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 8.993856655290102, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.1875, "learning_rate": 0.0009572853268452398, "loss": 0.0355, "macro_f1": 0.3272727429866791, "num_tokens": 2616615.0, "repeat_count": 0.0, "routers_loss": 0.030503373593091965, "skip_count": 1.0, "step": 1654, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 28.0, "epoch": 9.0, "f1_execute": 0.978723406791687, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 3.3125, "learning_rate": 0.0009571391682627412, "loss": 0.0431, "macro_f1": 0.9262410998344421, "num_tokens": 2618244.0, "repeat_count": 2.0, "routers_loss": 0.03048447147011757, "skip_count": 3.0, "step": 1656, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 9.010921501706484, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.421875, "learning_rate": 0.0009569927712454401, "loss": 0.0624, "macro_f1": 0.5492662787437439, "num_tokens": 2621365.0, "repeat_count": 0.0, "routers_loss": 0.12667539715766907, "skip_count": 2.0, "step": 1658, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.02184300341297, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.875, "learning_rate": 0.0009568461358696943, "loss": 0.0328, "macro_f1": 0.3333333432674408, "num_tokens": 2625393.0, "repeat_count": 0.0, "routers_loss": 0.006958470214158297, "skip_count": 0.0, "step": 1660, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 9.032764505119454, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4921875, "learning_rate": 0.0009566992622119859, "loss": 0.028, "macro_f1": 0.32098767161369324, "num_tokens": 2629314.0, "repeat_count": 1.0, "routers_loss": 0.05469416454434395, "skip_count": 0.0, "step": 1662, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 9.043686006825938, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.3125, "learning_rate": 0.0009565521503489214, "loss": 0.0578, "macro_f1": 0.3272727429866791, "num_tokens": 2632112.0, "repeat_count": 0.0, "routers_loss": 0.04393986612558365, "skip_count": 0.0, "step": 1664, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.054607508532424, "f1_execute": 0.9433962106704712, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.65625, "learning_rate": 0.0009564048003572312, "loss": 0.0489, "macro_f1": 0.31446540355682373, "num_tokens": 2635858.0, "repeat_count": 0.0, "routers_loss": 0.11510798335075378, "skip_count": 1.0, "step": 1666, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.75, "avg_layers": 25.0, "epoch": 9.065529010238908, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 0.8571428656578064, "grad_norm": 2.046875, "learning_rate": 0.0009562572123137705, "loss": 0.0428, "macro_f1": 0.6122449040412903, "num_tokens": 2638619.0, "repeat_count": 0.0, "routers_loss": 0.11790911853313446, "skip_count": 4.0, "step": 1668, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.076450511945392, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.015625, "learning_rate": 0.0009561093862955182, "loss": 0.0135, "macro_f1": 0.6666666865348816, "num_tokens": 2641838.0, "repeat_count": 0.0, "routers_loss": 0.008307491429150105, "skip_count": 2.0, "step": 1670, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.087372013651876, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.9296875, "learning_rate": 0.0009559613223795772, "loss": 0.0363, "macro_f1": 0.6666666865348816, "num_tokens": 2644835.0, "repeat_count": 0.0, "routers_loss": 0.0004966134438291192, "skip_count": 2.0, "step": 1672, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.098293515358362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.0078125, "learning_rate": 0.000955813020643175, "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 2648134.0, "repeat_count": 0.0, "routers_loss": 0.0006114484276622534, "skip_count": 1.0, "step": 1674, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.109215017064846, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.078125, "learning_rate": 0.0009556644811636628, "loss": 0.0238, "macro_f1": 0.32098764181137085, "num_tokens": 2650884.0, "repeat_count": 0.0, "routers_loss": 0.1162124052643776, "skip_count": 2.0, "step": 1676, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.12013651877133, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 3.5, "learning_rate": 0.0009555157040185159, "loss": 0.0481, "macro_f1": 0.3333333432674408, "num_tokens": 2653657.0, "repeat_count": 0.0, "routers_loss": 0.002997790463268757, "skip_count": 0.0, "step": 1678, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.131058020477816, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.7734375, "learning_rate": 0.0009553666892853335, "loss": 0.0167, "macro_f1": 0.3333333432674408, "num_tokens": 2657099.0, "repeat_count": 0.0, "routers_loss": 0.004867739975452423, "skip_count": 0.0, "step": 1680, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.1419795221843, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.109375, "learning_rate": 0.0009552174370418388, "loss": 0.0163, "macro_f1": 0.3333333432674408, "num_tokens": 2660292.0, "repeat_count": 0.0, "routers_loss": 7.869673572713509e-05, "skip_count": 0.0, "step": 1682, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.152901023890784, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.609375, "learning_rate": 0.0009550679473658787, "loss": 0.0366, "macro_f1": 0.3272727429866791, "num_tokens": 2663686.0, "repeat_count": 1.0, "routers_loss": 0.14579863846302032, "skip_count": 0.0, "step": 1684, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 9.16382252559727, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.453125, "learning_rate": 0.0009549182203354241, "loss": 0.0258, "macro_f1": 0.32098767161369324, "num_tokens": 2666521.0, "repeat_count": 0.0, "routers_loss": 0.22247549891471863, "skip_count": 1.0, "step": 1686, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.174744027303754, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5703125, "learning_rate": 0.0009547682560285699, "loss": 0.0237, "macro_f1": 0.3333333432674408, "num_tokens": 2669100.0, "repeat_count": 0.0, "routers_loss": 0.008106010966002941, "skip_count": 0.0, "step": 1688, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.185665529010238, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.171875, "learning_rate": 0.0009546180545235343, "loss": 0.0435, "macro_f1": 0.3272727429866791, "num_tokens": 2672370.0, "repeat_count": 0.0, "routers_loss": 0.21630284190177917, "skip_count": 1.0, "step": 1690, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.196587030716724, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.390625, "learning_rate": 0.0009544676158986596, "loss": 0.0361, "macro_f1": 0.3333333432674408, "num_tokens": 2675735.0, "repeat_count": 0.0, "routers_loss": 0.004031798802316189, "skip_count": 0.0, "step": 1692, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 9.207508532423208, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 1.390625, "learning_rate": 0.0009543169402324114, "loss": 0.0346, "macro_f1": 0.5934640765190125, "num_tokens": 2678910.0, "repeat_count": 0.0, "routers_loss": 0.027823299169540405, "skip_count": 3.0, "step": 1694, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.218430034129693, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 2.625, "learning_rate": 0.0009541660276033794, "loss": 0.0365, "macro_f1": 1.0, "num_tokens": 2682313.0, "repeat_count": 1.0, "routers_loss": 0.014843414537608624, "skip_count": 2.0, "step": 1696, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 9.229351535836177, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.453125, "learning_rate": 0.0009540148780902767, "loss": 0.0363, "macro_f1": 0.6666666865348816, "num_tokens": 2684992.0, "repeat_count": 0.0, "routers_loss": 0.009402195923030376, "skip_count": 3.0, "step": 1698, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.240273037542662, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2890625, "learning_rate": 0.0009538634917719397, "loss": 0.0262, "macro_f1": 0.3333333432674408, "num_tokens": 2688409.0, "repeat_count": 0.0, "routers_loss": 0.00543947983533144, "skip_count": 0.0, "step": 1700, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 9.251194539249147, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 2.484375, "learning_rate": 0.0009537118687273286, "loss": 0.056, "macro_f1": 0.5866667032241821, "num_tokens": 2691418.0, "repeat_count": 1.0, "routers_loss": 0.32384559512138367, "skip_count": 3.0, "step": 1702, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.26211604095563, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4609375, "learning_rate": 0.000953560009035527, "loss": 0.0433, "macro_f1": 0.3272727429866791, "num_tokens": 2694811.0, "repeat_count": 0.0, "routers_loss": 0.03660385683178902, "skip_count": 1.0, "step": 1704, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.273037542662117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.90234375, "learning_rate": 0.000953407912775742, "loss": 0.0161, "macro_f1": 0.3333333432674408, "num_tokens": 2698449.0, "repeat_count": 0.0, "routers_loss": 0.003029557643458247, "skip_count": 0.0, "step": 1706, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 9.2839590443686, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.6015625, "learning_rate": 0.0009532555800273039, "loss": 0.0561, "macro_f1": 0.542222261428833, "num_tokens": 2701485.0, "repeat_count": 0.0, "routers_loss": 0.2627774477005005, "skip_count": 4.0, "step": 1708, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.294880546075085, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.515625, "learning_rate": 0.0009531030108696668, "loss": 0.0439, "macro_f1": 0.3333333432674408, "num_tokens": 2704480.0, "repeat_count": 0.0, "routers_loss": 0.0007938223425298929, "skip_count": 0.0, "step": 1710, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 9.30580204778157, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 4.40625, "learning_rate": 0.0009529502053824073, "loss": 0.029, "macro_f1": 0.4871794879436493, "num_tokens": 2707179.0, "repeat_count": 0.0, "routers_loss": 0.17135916650295258, "skip_count": 1.0, "step": 1712, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.316723549488055, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0, "learning_rate": 0.000952797163645226, "loss": 0.0353, "macro_f1": 0.3333333432674408, "num_tokens": 2709790.0, "repeat_count": 0.0, "routers_loss": 0.0020914478227496147, "skip_count": 0.0, "step": 1714, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.327645051194539, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5078125, "learning_rate": 0.0009526438857379463, "loss": 0.0323, "macro_f1": 0.3333333432674408, "num_tokens": 2712481.0, "repeat_count": 0.0, "routers_loss": 0.008548402227461338, "skip_count": 0.0, "step": 1716, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 9.338566552901025, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 2.421875, "learning_rate": 0.0009524903717405151, "loss": 0.0424, "macro_f1": 1.0, "num_tokens": 2715387.0, "repeat_count": 1.0, "routers_loss": 0.007046231999993324, "skip_count": 1.0, "step": 1718, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 9.349488054607509, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 3.640625, "learning_rate": 0.0009523366217330022, "loss": 0.0873, "macro_f1": 0.5492662787437439, "num_tokens": 2718506.0, "repeat_count": 0.0, "routers_loss": 0.027576079592108727, "skip_count": 2.0, "step": 1720, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 9.360409556313993, "f1_execute": 0.9599999785423279, "f1_repeat": 1.0, "f1_skip": 0.5, "grad_norm": 2.484375, "learning_rate": 0.0009521826357956008, "loss": 0.0499, "macro_f1": 0.8200000524520874, "num_tokens": 2721680.0, "repeat_count": 1.0, "routers_loss": 0.056622326374053955, "skip_count": 2.0, "step": 1722, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 9.371331058020477, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 1.0625, "learning_rate": 0.0009520284140086267, "loss": 0.0271, "macro_f1": 0.6666666865348816, "num_tokens": 2724823.0, "repeat_count": 2.0, "routers_loss": 0.00015587422240059823, "skip_count": 0.0, "step": 1724, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.382252559726963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.765625, "learning_rate": 0.000951873956452519, "loss": 0.0368, "macro_f1": 0.3333333432674408, "num_tokens": 2727739.0, "repeat_count": 0.0, "routers_loss": 0.00112772302236408, "skip_count": 0.0, "step": 1726, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.393174061433447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.6328125, "learning_rate": 0.0009517192632078396, "loss": 0.042, "macro_f1": 0.3333333432674408, "num_tokens": 2730365.0, "repeat_count": 0.0, "routers_loss": 0.0017851196462288499, "skip_count": 0.0, "step": 1728, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 24.0, "epoch": 9.404095563139931, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.4375, "learning_rate": 0.0009515643343552738, "loss": 0.0422, "macro_f1": 0.542222261428833, "num_tokens": 2732878.0, "repeat_count": 0.0, "routers_loss": 0.020665084943175316, "skip_count": 2.0, "step": 1730, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 24.0, "epoch": 9.415017064846417, "f1_execute": 0.9130434393882751, "f1_repeat": 0.0, "f1_skip": 0.6000000238418579, "grad_norm": 2.1875, "learning_rate": 0.0009514091699756291, "loss": 0.0526, "macro_f1": 0.5043478012084961, "num_tokens": 2735772.0, "repeat_count": 0.0, "routers_loss": 0.21053816378116608, "skip_count": 6.0, "step": 1732, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.425938566552901, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.3046875, "learning_rate": 0.0009512537701498363, "loss": 0.03, "macro_f1": 0.8823530077934265, "num_tokens": 2738752.0, "repeat_count": 1.0, "routers_loss": 0.11412277817726135, "skip_count": 1.0, "step": 1734, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.436860068259385, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.46875, "learning_rate": 0.0009510981349589489, "loss": 0.0424, "macro_f1": 0.6666666865348816, "num_tokens": 2741445.0, "repeat_count": 0.0, "routers_loss": 0.000764985743444413, "skip_count": 1.0, "step": 1736, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 9.447781569965871, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 1.421875, "learning_rate": 0.0009509422644841432, "loss": 0.0421, "macro_f1": 1.0, "num_tokens": 2744416.0, "repeat_count": 1.0, "routers_loss": 0.00021090090740472078, "skip_count": 1.0, "step": 1738, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.458703071672355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.671875, "learning_rate": 0.0009507861588067181, "loss": 0.0518, "macro_f1": 0.3333333432674408, "num_tokens": 2747554.0, "repeat_count": 0.0, "routers_loss": 0.0030081600416451693, "skip_count": 0.0, "step": 1740, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.46962457337884, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.96875, "learning_rate": 0.0009506298180080954, "loss": 0.0457, "macro_f1": 0.3333333432674408, "num_tokens": 2750990.0, "repeat_count": 0.0, "routers_loss": 0.003131982171908021, "skip_count": 0.0, "step": 1742, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 9.480546075085325, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.34765625, "learning_rate": 0.0009504732421698192, "loss": 0.0242, "macro_f1": 1.0, "num_tokens": 2754645.0, "repeat_count": 1.0, "routers_loss": 0.0012672391021624207, "skip_count": 1.0, "step": 1744, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.491467576791809, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.6953125, "learning_rate": 0.0009503164313735567, "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 2758782.0, "repeat_count": 0.0, "routers_loss": 0.007686510682106018, "skip_count": 0.0, "step": 1746, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.502389078498293, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.25, "learning_rate": 0.0009501593857010968, "loss": 0.0362, "macro_f1": 0.3333333432674408, "num_tokens": 2762393.0, "repeat_count": 0.0, "routers_loss": 0.010150409303605556, "skip_count": 0.0, "step": 1748, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 9.513310580204777, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 2.140625, "learning_rate": 0.0009500021052343519, "loss": 0.0422, "macro_f1": 0.5934640765190125, "num_tokens": 2766252.0, "repeat_count": 0.0, "routers_loss": 0.18260988593101501, "skip_count": 3.0, "step": 1750, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 28.0, "epoch": 9.524232081911263, "f1_execute": 0.9767441749572754, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 3.0625, "learning_rate": 0.0009498445900553561, "loss": 0.0232, "macro_f1": 0.9446290731430054, "num_tokens": 2768919.0, "repeat_count": 3.0, "routers_loss": 0.1400066763162613, "skip_count": 4.0, "step": 1752, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.535153583617747, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.0859375, "learning_rate": 0.0009496868402462664, "loss": 0.019, "macro_f1": 0.6666666865348816, "num_tokens": 2772006.0, "repeat_count": 0.0, "routers_loss": 0.006594743113964796, "skip_count": 1.0, "step": 1754, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.546075085324231, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3671875, "learning_rate": 0.0009495288558893619, "loss": 0.0195, "macro_f1": 0.3272727429866791, "num_tokens": 2774920.0, "repeat_count": 0.0, "routers_loss": 0.013973092660307884, "skip_count": 1.0, "step": 1756, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.556996587030717, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.98046875, "learning_rate": 0.0009493706370670441, "loss": 0.029, "macro_f1": 0.3333333432674408, "num_tokens": 2779435.0, "repeat_count": 0.0, "routers_loss": 0.0011850029695779085, "skip_count": 0.0, "step": 1758, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.567918088737201, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.578125, "learning_rate": 0.0009492121838618371, "loss": 0.0596, "macro_f1": 0.6666666865348816, "num_tokens": 2782746.0, "repeat_count": 0.0, "routers_loss": 0.0003245700499974191, "skip_count": 2.0, "step": 1760, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.578839590443685, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.234375, "learning_rate": 0.0009490534963563867, "loss": 0.0169, "macro_f1": 0.3333333432674408, "num_tokens": 2785207.0, "repeat_count": 0.0, "routers_loss": 0.0003590266569517553, "skip_count": 0.0, "step": 1762, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.589761092150171, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.69921875, "learning_rate": 0.0009488945746334613, "loss": 0.0686, "macro_f1": 0.6666666865348816, "num_tokens": 2788741.0, "repeat_count": 0.0, "routers_loss": 0.0021931170485913754, "skip_count": 2.0, "step": 1764, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.600682593856655, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.7109375, "learning_rate": 0.0009487354187759513, "loss": 0.0211, "macro_f1": 0.6601307392120361, "num_tokens": 2791613.0, "repeat_count": 1.0, "routers_loss": 0.00803866796195507, "skip_count": 2.0, "step": 1766, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 9.61160409556314, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.1796875, "learning_rate": 0.0009485760288668692, "loss": 0.021, "macro_f1": 0.5492662787437439, "num_tokens": 2794827.0, "repeat_count": 0.0, "routers_loss": 0.02076871134340763, "skip_count": 2.0, "step": 1768, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 9.622525597269625, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.2421875, "learning_rate": 0.0009484164049893498, "loss": 0.0209, "macro_f1": 0.542222261428833, "num_tokens": 2798022.0, "repeat_count": 0.0, "routers_loss": 0.03503800928592682, "skip_count": 4.0, "step": 1770, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 9.63344709897611, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 2.0625, "learning_rate": 0.0009482565472266498, "loss": 0.0658, "macro_f1": 1.0, "num_tokens": 2800563.0, "repeat_count": 1.0, "routers_loss": 0.0016337675042450428, "skip_count": 1.0, "step": 1772, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.644368600682593, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.625, "learning_rate": 0.0009480964556621476, "loss": 0.0274, "macro_f1": 0.3333333432674408, "num_tokens": 2803727.0, "repeat_count": 0.0, "routers_loss": 0.010637945495545864, "skip_count": 0.0, "step": 1774, "text_loss": 0.0 }, { "acc_repeat": 0.5, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 9.655290102389078, "f1_execute": 0.9600000381469727, "f1_repeat": 0.6666666865348816, "f1_skip": 0.6666666865348816, "grad_norm": 1.734375, "learning_rate": 0.000947936130379344, "loss": 0.0433, "macro_f1": 0.7644445300102234, "num_tokens": 2807005.0, "repeat_count": 2.0, "routers_loss": 0.18059520423412323, "skip_count": 2.0, "step": 1776, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.666211604095563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1328125, "learning_rate": 0.0009477755714618615, "loss": 0.0313, "macro_f1": 0.3333333432674408, "num_tokens": 2810487.0, "repeat_count": 0.0, "routers_loss": 0.0064196097664535046, "skip_count": 0.0, "step": 1778, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 9.677133105802048, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 8.3125, "learning_rate": 0.0009476147789934445, "loss": 0.0808, "macro_f1": 0.3272727429866791, "num_tokens": 2813372.0, "repeat_count": 0.0, "routers_loss": 0.015061851590871811, "skip_count": 0.0, "step": 1780, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.688054607508532, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.359375, "learning_rate": 0.0009474537530579591, "loss": 0.0714, "macro_f1": 0.3333333432674408, "num_tokens": 2816396.0, "repeat_count": 0.0, "routers_loss": 0.010107035748660564, "skip_count": 0.0, "step": 1782, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 9.698976109215018, "f1_execute": 0.9629629850387573, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.90625, "learning_rate": 0.0009472924937393934, "loss": 0.0442, "macro_f1": 0.32098767161369324, "num_tokens": 2820182.0, "repeat_count": 1.0, "routers_loss": 0.12446491420269012, "skip_count": 0.0, "step": 1784, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 9.709897610921502, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.515625, "learning_rate": 0.000947131001121857, "loss": 0.0816, "macro_f1": 0.5427350401878357, "num_tokens": 2823400.0, "repeat_count": 0.0, "routers_loss": 0.08261986076831818, "skip_count": 2.0, "step": 1786, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.5, "avg_layers": 29.0, "epoch": 9.720819112627986, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.2421875, "learning_rate": 0.0009469692752895813, "loss": 0.0202, "macro_f1": 0.8820862174034119, "num_tokens": 2825936.0, "repeat_count": 2.0, "routers_loss": 0.023021897301077843, "skip_count": 2.0, "step": 1788, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.731740614334472, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.984375, "learning_rate": 0.0009468073163269191, "loss": 0.041, "macro_f1": 0.3333333432674408, "num_tokens": 2829761.0, "repeat_count": 0.0, "routers_loss": 0.0007484396337531507, "skip_count": 0.0, "step": 1790, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 26.0, "epoch": 9.742662116040956, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.6171875, "learning_rate": 0.0009466451243183454, "loss": 0.0413, "macro_f1": 0.32098764181137085, "num_tokens": 2833014.0, "repeat_count": 0.0, "routers_loss": 0.16819913685321808, "skip_count": 0.0, "step": 1792, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 9.75358361774744, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 3.3125, "learning_rate": 0.0009464826993484561, "loss": 0.0628, "macro_f1": 0.6666666865348816, "num_tokens": 2836477.0, "repeat_count": 0.0, "routers_loss": 0.005633242893964052, "skip_count": 3.0, "step": 1794, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 9.764505119453926, "f1_execute": 0.9795917868614197, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.9375, "learning_rate": 0.000946320041501969, "loss": 0.0358, "macro_f1": 0.6598639488220215, "num_tokens": 2839617.0, "repeat_count": 1.0, "routers_loss": 0.1467967927455902, "skip_count": 3.0, "step": 1796, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 9.77542662116041, "f1_execute": 0.9615384340286255, "f1_repeat": 0.6666666865348816, "f1_skip": 0.0, "grad_norm": 0.75, "learning_rate": 0.0009461571508637232, "loss": 0.0327, "macro_f1": 0.5427350401878357, "num_tokens": 2843307.0, "repeat_count": 1.0, "routers_loss": 0.021760720759630203, "skip_count": 1.0, "step": 1798, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 9.786348122866894, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 1.8203125, "learning_rate": 0.0009459940275186791, "loss": 0.0378, "macro_f1": 0.5866667032241821, "num_tokens": 2845819.0, "repeat_count": 1.0, "routers_loss": 0.0846707671880722, "skip_count": 3.0, "step": 1800, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 9.797269624573378, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.65625, "learning_rate": 0.0009458306715519189, "loss": 0.042, "macro_f1": 0.5492662787437439, "num_tokens": 2848999.0, "repeat_count": 0.0, "routers_loss": 0.045253075659275055, "skip_count": 2.0, "step": 1802, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.808191126279864, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 1.671875, "learning_rate": 0.0009456670830486456, "loss": 0.0497, "macro_f1": 1.0, "num_tokens": 2852452.0, "repeat_count": 2.0, "routers_loss": 0.010481602512300014, "skip_count": 3.0, "step": 1804, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.819112627986348, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5234375, "learning_rate": 0.0009455032620941839, "loss": 0.0547, "macro_f1": 0.3333333432674408, "num_tokens": 2855738.0, "repeat_count": 0.0, "routers_loss": 0.007190074771642685, "skip_count": 0.0, "step": 1806, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.830034129692832, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.515625, "learning_rate": 0.0009453392087739796, "loss": 0.0413, "macro_f1": 0.3333333432674408, "num_tokens": 2858610.0, "repeat_count": 0.0, "routers_loss": 0.006349124480038881, "skip_count": 0.0, "step": 1808, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 28.0, "epoch": 9.840955631399318, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.640625, "learning_rate": 0.0009451749231735996, "loss": 0.0408, "macro_f1": 0.5427350401878357, "num_tokens": 2862382.0, "repeat_count": 0.0, "routers_loss": 0.07400884479284286, "skip_count": 2.0, "step": 1810, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.851877133105802, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.3203125, "learning_rate": 0.0009450104053787321, "loss": 0.0458, "macro_f1": 0.3272727429866791, "num_tokens": 2865554.0, "repeat_count": 0.0, "routers_loss": 0.011085378006100655, "skip_count": 1.0, "step": 1812, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.862798634812286, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.3046875, "learning_rate": 0.0009448456554751864, "loss": 0.0475, "macro_f1": 0.6666666865348816, "num_tokens": 2869395.0, "repeat_count": 0.0, "routers_loss": 0.0031208908185362816, "skip_count": 1.0, "step": 1814, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.800000011920929, "avg_layers": 23.0, "epoch": 9.873720136518772, "f1_execute": 0.9333332777023315, "f1_repeat": 0.0, "f1_skip": 0.8000000715255737, "grad_norm": 1.171875, "learning_rate": 0.0009446806735488926, "loss": 0.0394, "macro_f1": 0.5777778029441833, "num_tokens": 2872424.0, "repeat_count": 1.0, "routers_loss": 0.15345056354999542, "skip_count": 5.0, "step": 1816, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.884641638225256, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.03125, "learning_rate": 0.0009445154596859023, "loss": 0.0443, "macro_f1": 0.6666666865348816, "num_tokens": 2875656.0, "repeat_count": 0.0, "routers_loss": 0.00394861213862896, "skip_count": 1.0, "step": 1818, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.89556313993174, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2421875, "learning_rate": 0.0009443500139723878, "loss": 0.0417, "macro_f1": 0.3333333432674408, "num_tokens": 2878698.0, "repeat_count": 0.0, "routers_loss": 0.0076905800960958, "skip_count": 0.0, "step": 1820, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 9.906484641638226, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 1.9296875, "learning_rate": 0.0009441843364946419, "loss": 0.0535, "macro_f1": 0.5934640765190125, "num_tokens": 2881791.0, "repeat_count": 0.0, "routers_loss": 0.014517856761813164, "skip_count": 2.0, "step": 1822, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.91740614334471, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.90625, "learning_rate": 0.0009440184273390792, "loss": 0.0585, "macro_f1": 0.3333333432674408, "num_tokens": 2886465.0, "repeat_count": 0.0, "routers_loss": 0.008793013170361519, "skip_count": 0.0, "step": 1824, "text_loss": 0.0 }, { "acc_repeat": 0.75, "acc_skip": 0.75, "avg_layers": 28.0, "epoch": 9.928327645051194, "f1_execute": 0.9523809552192688, "f1_repeat": 0.8571428656578064, "f1_skip": 0.8571428656578064, "grad_norm": 3.546875, "learning_rate": 0.0009438522865922343, "loss": 0.1052, "macro_f1": 0.888888955116272, "num_tokens": 2889395.0, "repeat_count": 4.0, "routers_loss": 0.07290194928646088, "skip_count": 4.0, "step": 1826, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.93924914675768, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.671875, "learning_rate": 0.0009436859143407634, "loss": 0.0495, "macro_f1": 0.3333333432674408, "num_tokens": 2892474.0, "repeat_count": 0.0, "routers_loss": 0.006077466998249292, "skip_count": 0.0, "step": 1828, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.950170648464164, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.1796875, "learning_rate": 0.0009435193106714424, "loss": 0.0331, "macro_f1": 0.3333333432674408, "num_tokens": 2895330.0, "repeat_count": 0.0, "routers_loss": 0.005992726888507605, "skip_count": 0.0, "step": 1830, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 9.961092150170648, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.984375, "learning_rate": 0.0009433524756711691, "loss": 0.0432, "macro_f1": 0.6666666865348816, "num_tokens": 2898138.0, "repeat_count": 0.0, "routers_loss": 0.011334298178553581, "skip_count": 2.0, "step": 1832, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 9.972013651877132, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 3.046875, "learning_rate": 0.000943185409426961, "loss": 0.0356, "macro_f1": 0.6666666865348816, "num_tokens": 2901097.0, "repeat_count": 0.0, "routers_loss": 0.000504772353451699, "skip_count": 1.0, "step": 1834, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 9.982935153583618, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.625, "learning_rate": 0.0009430181120259564, "loss": 0.0302, "macro_f1": 0.3272727429866791, "num_tokens": 2904523.0, "repeat_count": 0.0, "routers_loss": 0.01906302012503147, "skip_count": 0.0, "step": 1836, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 9.993856655290102, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.328125, "learning_rate": 0.0009428505835554149, "loss": 0.0512, "macro_f1": 0.3333333432674408, "num_tokens": 2907623.0, "repeat_count": 0.0, "routers_loss": 0.0019522482762113214, "skip_count": 0.0, "step": 1838, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 10.0, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 4.65625, "learning_rate": 0.0009426828241027156, "loss": 0.0374, "macro_f1": 0.5492662787437439, "num_tokens": 2909160.0, "repeat_count": 0.0, "routers_loss": 0.07898300886154175, "skip_count": 2.0, "step": 1840, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.010921501706484, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.34375, "learning_rate": 0.0009425148337553586, "loss": 0.0231, "macro_f1": 0.3333333432674408, "num_tokens": 2912463.0, "repeat_count": 0.0, "routers_loss": 0.005622925236821175, "skip_count": 0.0, "step": 1842, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.02184300341297, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4375, "learning_rate": 0.0009423466126009645, "loss": 0.0212, "macro_f1": 0.3333333432674408, "num_tokens": 2915389.0, "repeat_count": 0.0, "routers_loss": 0.0016232851194217801, "skip_count": 0.0, "step": 1844, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.3333333432674408, "avg_layers": 26.0, "epoch": 10.032764505119454, "f1_execute": 0.9166666865348816, "f1_repeat": 1.0, "f1_skip": 0.3333333432674408, "grad_norm": 2.671875, "learning_rate": 0.000942178160727274, "loss": 0.0355, "macro_f1": 0.75, "num_tokens": 2918199.0, "repeat_count": 1.0, "routers_loss": 0.16088271141052246, "skip_count": 3.0, "step": 1846, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.043686006825938, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.640625, "learning_rate": 0.0009420094782221485, "loss": 0.0185, "macro_f1": 0.6603773832321167, "num_tokens": 2921138.0, "repeat_count": 1.0, "routers_loss": 0.014421635307371616, "skip_count": 0.0, "step": 1848, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.054607508532424, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.9140625, "learning_rate": 0.0009418405651735691, "loss": 0.0307, "macro_f1": 0.3333333432674408, "num_tokens": 2924215.0, "repeat_count": 0.0, "routers_loss": 0.004555100109428167, "skip_count": 0.0, "step": 1850, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.065529010238908, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.66796875, "learning_rate": 0.0009416714216696379, "loss": 0.0118, "macro_f1": 0.6666666865348816, "num_tokens": 2927381.0, "repeat_count": 0.0, "routers_loss": 0.005014689173549414, "skip_count": 1.0, "step": 1852, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 10.076450511945392, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.546875, "learning_rate": 0.0009415020477985767, "loss": 0.0286, "macro_f1": 0.5492662787437439, "num_tokens": 2930611.0, "repeat_count": 0.0, "routers_loss": 0.05079558119177818, "skip_count": 2.0, "step": 1854, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.087372013651876, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.8203125, "learning_rate": 0.0009413324436487276, "loss": 0.064, "macro_f1": 0.3333333432674408, "num_tokens": 2933734.0, "repeat_count": 0.0, "routers_loss": 0.0009134297724813223, "skip_count": 0.0, "step": 1856, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.098293515358362, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.390625, "learning_rate": 0.0009411626093085527, "loss": 0.0211, "macro_f1": 0.3333333432674408, "num_tokens": 2936975.0, "repeat_count": 0.0, "routers_loss": 0.0007149233133532107, "skip_count": 0.0, "step": 1858, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.109215017064846, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.5546875, "learning_rate": 0.0009409925448666343, "loss": 0.0186, "macro_f1": 0.6666666865348816, "num_tokens": 2940450.0, "repeat_count": 0.0, "routers_loss": 0.0011649983935058117, "skip_count": 1.0, "step": 1860, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.12013651877133, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.55859375, "learning_rate": 0.0009408222504116747, "loss": 0.0252, "macro_f1": 0.3333333432674408, "num_tokens": 2943643.0, "repeat_count": 0.0, "routers_loss": 0.0003875679976772517, "skip_count": 0.0, "step": 1862, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 10.131058020477816, "f1_execute": 0.9615384340286255, "f1_repeat": 0.0, "f1_skip": 0.5, "grad_norm": 0.6875, "learning_rate": 0.0009406517260324961, "loss": 0.0172, "macro_f1": 0.4871794879436493, "num_tokens": 2947548.0, "repeat_count": 0.0, "routers_loss": 0.11171325296163559, "skip_count": 2.0, "step": 1864, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.1419795221843, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.359375, "learning_rate": 0.0009404809718180407, "loss": 0.032, "macro_f1": 0.3333333432674408, "num_tokens": 2951040.0, "repeat_count": 0.0, "routers_loss": 0.0003539674507919699, "skip_count": 0.0, "step": 1866, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 26.0, "epoch": 10.152901023890784, "f1_execute": 0.9387754797935486, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.734375, "learning_rate": 0.0009403099878573705, "loss": 0.0227, "macro_f1": 0.5351474285125732, "num_tokens": 2953495.0, "repeat_count": 1.0, "routers_loss": 0.10906704515218735, "skip_count": 4.0, "step": 1868, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.16382252559727, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.443359375, "learning_rate": 0.0009401387742396675, "loss": 0.0077, "macro_f1": 0.3333333432674408, "num_tokens": 2956418.0, "repeat_count": 0.0, "routers_loss": 0.0009214511956088245, "skip_count": 0.0, "step": 1870, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.174744027303754, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.87109375, "learning_rate": 0.0009399673310542332, "loss": 0.0181, "macro_f1": 0.3333333432674408, "num_tokens": 2959383.0, "repeat_count": 0.0, "routers_loss": 0.0008078901446424425, "skip_count": 0.0, "step": 1872, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 26.0, "epoch": 10.185665529010238, "f1_execute": 0.9600000381469727, "f1_repeat": 0.0, "f1_skip": 0.800000011920929, "grad_norm": 0.484375, "learning_rate": 0.0009397956583904889, "loss": 0.0187, "macro_f1": 0.5866667032241821, "num_tokens": 2962072.0, "repeat_count": 1.0, "routers_loss": 0.10393868386745453, "skip_count": 3.0, "step": 1874, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 10.196587030716724, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 2.046875, "learning_rate": 0.0009396237563379761, "loss": 0.0386, "macro_f1": 1.0, "num_tokens": 2965786.0, "repeat_count": 1.0, "routers_loss": 0.003928918391466141, "skip_count": 1.0, "step": 1876, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.207508532423208, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.5625, "learning_rate": 0.0009394516249863552, "loss": 0.0427, "macro_f1": 0.3333333432674408, "num_tokens": 2968617.0, "repeat_count": 0.0, "routers_loss": 0.0030222597997635603, "skip_count": 0.0, "step": 1878, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 10.218430034129693, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.76953125, "learning_rate": 0.0009392792644254065, "loss": 0.0125, "macro_f1": 0.9265305995941162, "num_tokens": 2971834.0, "repeat_count": 1.0, "routers_loss": 0.02788766287267208, "skip_count": 3.0, "step": 1880, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.229351535836177, "f1_execute": 0.9230769276618958, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.5, "learning_rate": 0.0009391066747450299, "loss": 0.0317, "macro_f1": 0.3076923191547394, "num_tokens": 2974905.0, "repeat_count": 1.0, "routers_loss": 0.33092963695526123, "skip_count": 3.0, "step": 1882, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 10.240273037542662, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.890625, "learning_rate": 0.0009389338560352449, "loss": 0.0216, "macro_f1": 0.3272727429866791, "num_tokens": 2978218.0, "repeat_count": 0.0, "routers_loss": 0.035585418343544006, "skip_count": 0.0, "step": 1884, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 10.251194539249147, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.9375, "learning_rate": 0.0009387608083861905, "loss": 0.0142, "macro_f1": 0.6603773832321167, "num_tokens": 2981158.0, "repeat_count": 1.0, "routers_loss": 0.030651550740003586, "skip_count": 1.0, "step": 1886, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 10.26211604095563, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.359375, "learning_rate": 0.0009385875318881245, "loss": 0.0274, "macro_f1": 0.3272727429866791, "num_tokens": 2983996.0, "repeat_count": 0.0, "routers_loss": 0.05268227308988571, "skip_count": 0.0, "step": 1888, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.273037542662117, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.5703125, "learning_rate": 0.000938414026631425, "loss": 0.0372, "macro_f1": 0.3333333432674408, "num_tokens": 2987328.0, "repeat_count": 0.0, "routers_loss": 0.0005302507197484374, "skip_count": 0.0, "step": 1890, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.2839590443686, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 4.6875, "learning_rate": 0.0009382402927065885, "loss": 0.0841, "macro_f1": 0.6666666865348816, "num_tokens": 2989890.0, "repeat_count": 0.0, "routers_loss": 0.0011037392541766167, "skip_count": 2.0, "step": 1892, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 25.0, "epoch": 10.294880546075085, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.54296875, "learning_rate": 0.0009380663302042313, "loss": 0.0167, "macro_f1": 0.6666666865348816, "num_tokens": 2993348.0, "repeat_count": 0.0, "routers_loss": 0.007559314835816622, "skip_count": 3.0, "step": 1894, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.30580204778157, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.6796875, "learning_rate": 0.0009378921392150892, "loss": 0.0296, "macro_f1": 0.3333333432674408, "num_tokens": 2996331.0, "repeat_count": 0.0, "routers_loss": 0.004822882823646069, "skip_count": 0.0, "step": 1896, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.316723549488055, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.79296875, "learning_rate": 0.0009377177198300164, "loss": 0.023, "macro_f1": 0.3333333432674408, "num_tokens": 2999753.0, "repeat_count": 0.0, "routers_loss": 0.00028005297644995153, "skip_count": 0.0, "step": 1898, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.327645051194539, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.283203125, "learning_rate": 0.0009375430721399867, "loss": 0.02, "macro_f1": 0.3333333432674408, "num_tokens": 3003595.0, "repeat_count": 0.0, "routers_loss": 0.0004891012795269489, "skip_count": 0.0, "step": 1900, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 10.338566552901025, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 1.6328125, "learning_rate": 0.000937368196236093, "loss": 0.0316, "macro_f1": 0.5492662787437439, "num_tokens": 3006727.0, "repeat_count": 0.0, "routers_loss": 0.12135660648345947, "skip_count": 2.0, "step": 1902, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.349488054607509, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.6015625, "learning_rate": 0.000937193092209547, "loss": 0.0284, "macro_f1": 0.3333333432674408, "num_tokens": 3010960.0, "repeat_count": 0.0, "routers_loss": 0.0014633082319051027, "skip_count": 0.0, "step": 1904, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.360409556313993, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2734375, "learning_rate": 0.0009370177601516796, "loss": 0.0155, "macro_f1": 0.3333333432674408, "num_tokens": 3013885.0, "repeat_count": 0.0, "routers_loss": 0.0006136370939202607, "skip_count": 0.0, "step": 1906, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 29.0, "epoch": 10.371331058020477, "f1_execute": 0.9523809552192688, "f1_repeat": 0.8571428656578064, "f1_skip": 0.8571428656578064, "grad_norm": 1.0390625, "learning_rate": 0.0009368422001539405, "loss": 0.016, "macro_f1": 0.888888955116272, "num_tokens": 3017313.0, "repeat_count": 3.0, "routers_loss": 0.07522892951965332, "skip_count": 4.0, "step": 1908, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.382252559726963, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.53125, "learning_rate": 0.0009366664123078985, "loss": 0.0314, "macro_f1": 0.3333333432674408, "num_tokens": 3020255.0, "repeat_count": 0.0, "routers_loss": 0.00037478271406143904, "skip_count": 0.0, "step": 1910, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.393174061433447, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4140625, "learning_rate": 0.0009364903967052409, "loss": 0.0331, "macro_f1": 0.3333333432674408, "num_tokens": 3023977.0, "repeat_count": 0.0, "routers_loss": 0.0001930665603140369, "skip_count": 0.0, "step": 1912, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.5, "avg_layers": 27.0, "epoch": 10.404095563139931, "f1_execute": 0.9811320900917053, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.0625, "learning_rate": 0.000936314153437774, "loss": 0.0138, "macro_f1": 0.5492662787437439, "num_tokens": 3026913.0, "repeat_count": 0.0, "routers_loss": 0.034160565584897995, "skip_count": 2.0, "step": 1914, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.415017064846417, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 1.234375, "learning_rate": 0.0009361376825974229, "loss": 0.0137, "macro_f1": 1.0, "num_tokens": 3029882.0, "repeat_count": 1.0, "routers_loss": 0.0026114555075764656, "skip_count": 2.0, "step": 1916, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 27.0, "epoch": 10.425938566552901, "f1_execute": 0.9777777791023254, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 1.90625, "learning_rate": 0.0009359609842762314, "loss": 0.0114, "macro_f1": 0.9449735879898071, "num_tokens": 3032548.0, "repeat_count": 2.0, "routers_loss": 0.02721484750509262, "skip_count": 4.0, "step": 1918, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.436860068259385, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.234375, "learning_rate": 0.0009357840585663617, "loss": 0.0164, "macro_f1": 0.3333333432674408, "num_tokens": 3034991.0, "repeat_count": 0.0, "routers_loss": 0.0009023157763294876, "skip_count": 0.0, "step": 1920, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.75, "avg_layers": 28.0, "epoch": 10.447781569965871, "f1_execute": 0.9767441749572754, "f1_repeat": 1.0, "f1_skip": 0.8571428656578064, "grad_norm": 0.74609375, "learning_rate": 0.0009356069055600948, "loss": 0.0165, "macro_f1": 0.9446290731430054, "num_tokens": 3038157.0, "repeat_count": 3.0, "routers_loss": 0.017481215298175812, "skip_count": 4.0, "step": 1922, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.458703071672355, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.15625, "learning_rate": 0.0009354295253498301, "loss": 0.0138, "macro_f1": 0.6666666865348816, "num_tokens": 3041020.0, "repeat_count": 0.0, "routers_loss": 0.007951425388455391, "skip_count": 2.0, "step": 1924, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.46962457337884, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0390625, "learning_rate": 0.0009352519180280861, "loss": 0.0342, "macro_f1": 0.3333333432674408, "num_tokens": 3043950.0, "repeat_count": 0.0, "routers_loss": 0.001982828602194786, "skip_count": 0.0, "step": 1926, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 10.480546075085325, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 2.515625, "learning_rate": 0.0009350740836874987, "loss": 0.0463, "macro_f1": 1.0, "num_tokens": 3046539.0, "repeat_count": 1.0, "routers_loss": 0.00047102838288992643, "skip_count": 1.0, "step": 1928, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.6666666865348816, "avg_layers": 27.0, "epoch": 10.491467576791809, "f1_execute": 0.9795917868614197, "f1_repeat": 1.0, "f1_skip": 0.800000011920929, "grad_norm": 0.984375, "learning_rate": 0.0009348960224208233, "loss": 0.0111, "macro_f1": 0.9265305995941162, "num_tokens": 3049756.0, "repeat_count": 1.0, "routers_loss": 0.017243247479200363, "skip_count": 3.0, "step": 1930, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 10.502389078498293, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 0.369140625, "learning_rate": 0.0009347177343209328, "loss": 0.018, "macro_f1": 0.6666666865348816, "num_tokens": 3052427.0, "repeat_count": 1.0, "routers_loss": 0.00031657906947657466, "skip_count": 0.0, "step": 1932, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 10.513310580204777, "f1_execute": 0.9811320900917053, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 1.25, "learning_rate": 0.0009345392194808189, "loss": 0.0199, "macro_f1": 0.6603773832321167, "num_tokens": 3055833.0, "repeat_count": 1.0, "routers_loss": 0.03332769125699997, "skip_count": 1.0, "step": 1934, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.524232081911263, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.1875, "learning_rate": 0.0009343604779935915, "loss": 0.0418, "macro_f1": 0.6666666865348816, "num_tokens": 3058629.0, "repeat_count": 0.0, "routers_loss": 0.0007250228663906455, "skip_count": 2.0, "step": 1936, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.535153583617747, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.296875, "learning_rate": 0.0009341815099524784, "loss": 0.03, "macro_f1": 0.3272727429866791, "num_tokens": 3061668.0, "repeat_count": 0.0, "routers_loss": 0.02986256405711174, "skip_count": 1.0, "step": 1938, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.546075085324231, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.43359375, "learning_rate": 0.0009340023154508261, "loss": 0.0207, "macro_f1": 0.3333333432674408, "num_tokens": 3066289.0, "repeat_count": 0.0, "routers_loss": 0.00027556504937820137, "skip_count": 0.0, "step": 1940, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 28.0, "epoch": 10.556996587030717, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.78515625, "learning_rate": 0.0009338228945820988, "loss": 0.0321, "macro_f1": 1.0, "num_tokens": 3071000.0, "repeat_count": 1.0, "routers_loss": 0.003632819512858987, "skip_count": 1.0, "step": 1942, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.567918088737201, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.89453125, "learning_rate": 0.000933643247439879, "loss": 0.0241, "macro_f1": 0.3333333432674408, "num_tokens": 3074002.0, "repeat_count": 0.0, "routers_loss": 0.002939134370535612, "skip_count": 0.0, "step": 1944, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 10.578839590443685, "f1_execute": 0.9803921580314636, "f1_repeat": 0.6666666865348816, "f1_skip": 1.0, "grad_norm": 1.2109375, "learning_rate": 0.0009334633741178668, "loss": 0.0578, "macro_f1": 0.8823530077934265, "num_tokens": 3076683.0, "repeat_count": 1.0, "routers_loss": 0.15490587055683136, "skip_count": 1.0, "step": 1946, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.589761092150171, "f1_execute": 0.9629629254341125, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2421875, "learning_rate": 0.0009332832747098811, "loss": 0.0533, "macro_f1": 0.32098764181137085, "num_tokens": 3080859.0, "repeat_count": 0.0, "routers_loss": 0.10794607549905777, "skip_count": 2.0, "step": 1948, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.600682593856655, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.078125, "learning_rate": 0.0009331029493098577, "loss": 0.0183, "macro_f1": 0.6666666865348816, "num_tokens": 3084682.0, "repeat_count": 0.0, "routers_loss": 0.001257776515558362, "skip_count": 1.0, "step": 1950, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.61160409556314, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.421875, "learning_rate": 0.0009329223980118511, "loss": 0.0166, "macro_f1": 0.6666666865348816, "num_tokens": 3087642.0, "repeat_count": 0.0, "routers_loss": 0.003992061596363783, "skip_count": 1.0, "step": 1952, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.622525597269625, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.46875, "learning_rate": 0.0009327416209100332, "loss": 0.0291, "macro_f1": 0.3333333432674408, "num_tokens": 3091111.0, "repeat_count": 0.0, "routers_loss": 0.00031391315860673785, "skip_count": 0.0, "step": 1954, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.63344709897611, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.0625, "learning_rate": 0.0009325606180986939, "loss": 0.0369, "macro_f1": 0.3333333432674408, "num_tokens": 3094881.0, "repeat_count": 0.0, "routers_loss": 0.0010785188060253859, "skip_count": 0.0, "step": 1956, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 10.644368600682593, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 2.265625, "learning_rate": 0.0009323793896722403, "loss": 0.0341, "macro_f1": 0.6666666865348816, "num_tokens": 3097957.0, "repeat_count": 1.0, "routers_loss": 0.0014321994967758656, "skip_count": 0.0, "step": 1958, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 0.0, "avg_layers": 30.0, "epoch": 10.655290102389078, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 0.0, "grad_norm": 1.015625, "learning_rate": 0.0009321979357251981, "loss": 0.0215, "macro_f1": 0.6666666865348816, "num_tokens": 3101120.0, "repeat_count": 2.0, "routers_loss": 0.0008751509012654424, "skip_count": 0.0, "step": 1960, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.666211604095563, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 0.62109375, "learning_rate": 0.00093201625635221, "loss": 0.0132, "macro_f1": 0.3333333432674408, "num_tokens": 3103878.0, "repeat_count": 0.0, "routers_loss": 0.0029730487149208784, "skip_count": 0.0, "step": 1962, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.677133105802048, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.7578125, "learning_rate": 0.0009318343516480362, "loss": 0.018, "macro_f1": 0.6601307392120361, "num_tokens": 3107113.0, "repeat_count": 1.0, "routers_loss": 0.04190417379140854, "skip_count": 2.0, "step": 1964, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.688054607508532, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.8671875, "learning_rate": 0.0009316522217075547, "loss": 0.0467, "macro_f1": 0.3333333432674408, "num_tokens": 3110109.0, "repeat_count": 0.0, "routers_loss": 0.0013136990601196885, "skip_count": 0.0, "step": 1966, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.698976109215018, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.625, "learning_rate": 0.0009314698666257608, "loss": 0.0427, "macro_f1": 0.6601307392120361, "num_tokens": 3113902.0, "repeat_count": 1.0, "routers_loss": 0.021867308765649796, "skip_count": 2.0, "step": 1968, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 29.0, "epoch": 10.709897610921502, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.171875, "learning_rate": 0.0009312872864977676, "loss": 0.044, "macro_f1": 0.3272727429866791, "num_tokens": 3118294.0, "repeat_count": 0.0, "routers_loss": 0.08263491839170456, "skip_count": 0.0, "step": 1970, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 10.720819112627986, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.5546875, "learning_rate": 0.000931104481418805, "loss": 0.1169, "macro_f1": 0.3272727429866791, "num_tokens": 3121049.0, "repeat_count": 0.0, "routers_loss": 0.021012064069509506, "skip_count": 0.0, "step": 1972, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.731740614334472, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.0625, "learning_rate": 0.0009309214514842208, "loss": 0.0222, "macro_f1": 0.6666666865348816, "num_tokens": 3124066.0, "repeat_count": 0.0, "routers_loss": 0.011188115924596786, "skip_count": 1.0, "step": 1974, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.742662116040956, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.484375, "learning_rate": 0.0009307381967894796, "loss": 0.0369, "macro_f1": 0.6666666865348816, "num_tokens": 3127792.0, "repeat_count": 0.0, "routers_loss": 0.004685467109084129, "skip_count": 2.0, "step": 1976, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 27.0, "epoch": 10.75358361774744, "f1_execute": 0.9818181991577148, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.125, "learning_rate": 0.0009305547174301636, "loss": 0.0474, "macro_f1": 0.3272727429866791, "num_tokens": 3130947.0, "repeat_count": 0.0, "routers_loss": 0.03083117865025997, "skip_count": 0.0, "step": 1978, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 29.0, "epoch": 10.764505119453926, "f1_execute": 1.0, "f1_repeat": 1.0, "f1_skip": 1.0, "grad_norm": 0.953125, "learning_rate": 0.0009303710135019718, "loss": 0.0322, "macro_f1": 1.0, "num_tokens": 3133589.0, "repeat_count": 2.0, "routers_loss": 0.0010647289454936981, "skip_count": 1.0, "step": 1980, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.77542662116041, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.4765625, "learning_rate": 0.0009301870851007208, "loss": 0.034, "macro_f1": 0.3333333432674408, "num_tokens": 3136480.0, "repeat_count": 0.0, "routers_loss": 0.002559049753472209, "skip_count": 0.0, "step": 1982, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.786348122866894, "f1_execute": 0.9803921580314636, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 2.078125, "learning_rate": 0.000930002932322344, "loss": 0.0494, "macro_f1": 0.6601307392120361, "num_tokens": 3139357.0, "repeat_count": 1.0, "routers_loss": 0.053545739501714706, "skip_count": 2.0, "step": 1984, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.797269624573378, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 0.8125, "learning_rate": 0.0009298185552628918, "loss": 0.0152, "macro_f1": 0.6666666865348816, "num_tokens": 3142291.0, "repeat_count": 0.0, "routers_loss": 0.0008066387381404638, "skip_count": 1.0, "step": 1986, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.6666666865348816, "avg_layers": 25.0, "epoch": 10.808191126279864, "f1_execute": 0.9599999785423279, "f1_repeat": 0.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.71875, "learning_rate": 0.0009296339540185317, "loss": 0.0272, "macro_f1": 0.542222261428833, "num_tokens": 3145523.0, "repeat_count": 0.0, "routers_loss": 0.06776142120361328, "skip_count": 3.0, "step": 1988, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.819112627986348, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.46875, "learning_rate": 0.000929449128685548, "loss": 0.0377, "macro_f1": 0.3333333432674408, "num_tokens": 3148288.0, "repeat_count": 0.0, "routers_loss": 0.000135111651616171, "skip_count": 0.0, "step": 1990, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.830034129692832, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.2109375, "learning_rate": 0.0009292640793603419, "loss": 0.015, "macro_f1": 0.3333333432674408, "num_tokens": 3151380.0, "repeat_count": 0.0, "routers_loss": 0.0011454055784270167, "skip_count": 0.0, "step": 1992, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.840955631399318, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 1.734375, "learning_rate": 0.0009290788061394317, "loss": 0.0252, "macro_f1": 0.3333333432674408, "num_tokens": 3154326.0, "repeat_count": 0.0, "routers_loss": 0.0007376293069683015, "skip_count": 0.0, "step": 1994, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 1.0, "avg_layers": 26.0, "epoch": 10.851877133105802, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 1.0, "grad_norm": 1.7734375, "learning_rate": 0.0009288933091194522, "loss": 0.0264, "macro_f1": 0.6666666865348816, "num_tokens": 3157452.0, "repeat_count": 0.0, "routers_loss": 0.0024160572793334723, "skip_count": 2.0, "step": 1996, "text_loss": 0.0 }, { "acc_repeat": 1.0, "acc_skip": 1.0, "avg_layers": 27.0, "epoch": 10.862798634812286, "f1_execute": 0.9803921580314636, "f1_repeat": 1.0, "f1_skip": 0.6666666865348816, "grad_norm": 2.09375, "learning_rate": 0.000928707588397155, "loss": 0.0465, "macro_f1": 0.8823530077934265, "num_tokens": 3160086.0, "repeat_count": 1.0, "routers_loss": 0.010999886319041252, "skip_count": 1.0, "step": 1998, "text_loss": 0.0 }, { "acc_repeat": 0.0, "acc_skip": 0.0, "avg_layers": 28.0, "epoch": 10.873720136518772, "f1_execute": 1.0, "f1_repeat": 0.0, "f1_skip": 0.0, "grad_norm": 2.671875, "learning_rate": 0.0009285216440694083, "loss": 0.051, "macro_f1": 0.3333333432674408, "num_tokens": 3163708.0, "repeat_count": 0.0, "routers_loss": 0.004876194056123495, "skip_count": 0.0, "step": 2000, "text_loss": 0.0 } ], "logging_steps": 2, "max_steps": 9200, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.344603477156799e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }