| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.436860068259386, |
| "eval_steps": 500, |
| "global_step": 1000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "acc_repeat": 1.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 0.010921501706484642, |
| "f1_execute": 0.5142857432365417, |
| "f1_repeat": 0.2222222238779068, |
| "f1_skip": 0.0, |
| "grad_norm": 31.125, |
| "learning_rate": 2e-06, |
| "loss": 2.8198, |
| "macro_f1": 0.24550265073776245, |
| "num_tokens": 3507.0, |
| "repeat_count": 1.0, |
| "routers_loss": 1.076732873916626, |
| "skip_count": 2.0, |
| "step": 2, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 24.0, |
| "epoch": 0.021843003412969283, |
| "f1_execute": 0.6666666865348816, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 38.5, |
| "learning_rate": 6e-06, |
| "loss": 3.125, |
| "macro_f1": 0.222222238779068, |
| "num_tokens": 7330.0, |
| "repeat_count": 0.0, |
| "routers_loss": 4.3143134117126465, |
| "skip_count": 0.0, |
| "step": 4, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 24.0, |
| "epoch": 0.032764505119453925, |
| "f1_execute": 0.5999999642372131, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 33.75, |
| "learning_rate": 1e-05, |
| "loss": 3.0713, |
| "macro_f1": 0.19999998807907104, |
| "num_tokens": 11360.0, |
| "repeat_count": 0.0, |
| "routers_loss": 1.8818678855895996, |
| "skip_count": 0.0, |
| "step": 6, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 24.0, |
| "epoch": 0.04368600682593857, |
| "f1_execute": 0.5789473652839661, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 37.25, |
| "learning_rate": 1.4e-05, |
| "loss": 2.992, |
| "macro_f1": 0.19298246502876282, |
| "num_tokens": 14241.0, |
| "repeat_count": 1.0, |
| "routers_loss": 2.340613603591919, |
| "skip_count": 1.0, |
| "step": 8, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 24.0, |
| "epoch": 0.05460750853242321, |
| "f1_execute": 0.6666666865348816, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 34.5, |
| "learning_rate": 1.8e-05, |
| "loss": 3.0072, |
| "macro_f1": 0.222222238779068, |
| "num_tokens": 17520.0, |
| "repeat_count": 0.0, |
| "routers_loss": 1.7916433811187744, |
| "skip_count": 0.0, |
| "step": 10, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 0.06552901023890785, |
| "f1_execute": 0.6315789818763733, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 41.25, |
| "learning_rate": 2.2e-05, |
| "loss": 3.2227, |
| "macro_f1": 0.21052633225917816, |
| "num_tokens": 20401.0, |
| "repeat_count": 1.0, |
| "routers_loss": 2.2361459732055664, |
| "skip_count": 1.0, |
| "step": 12, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 0.07645051194539249, |
| "f1_execute": 0.5789473652839661, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.20000000298023224, |
| "grad_norm": 31.875, |
| "learning_rate": 2.6e-05, |
| "loss": 3.1809, |
| "macro_f1": 0.2596491277217865, |
| "num_tokens": 23722.0, |
| "repeat_count": 1.0, |
| "routers_loss": 2.6635637283325195, |
| "skip_count": 2.0, |
| "step": 14, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 0.08737201365187713, |
| "f1_execute": 0.6341463327407837, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 40.25, |
| "learning_rate": 3e-05, |
| "loss": 3.2606, |
| "macro_f1": 0.21138212084770203, |
| "num_tokens": 26754.0, |
| "repeat_count": 0.0, |
| "routers_loss": 1.967104196548462, |
| "skip_count": 0.0, |
| "step": 16, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 25.0, |
| "epoch": 0.09829351535836177, |
| "f1_execute": 0.5405405163764954, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.1666666567325592, |
| "grad_norm": 39.5, |
| "learning_rate": 3.4000000000000007e-05, |
| "loss": 2.9096, |
| "macro_f1": 0.23573574423789978, |
| "num_tokens": 29878.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.6965824365615845, |
| "skip_count": 2.0, |
| "step": 18, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 24.0, |
| "epoch": 0.10921501706484642, |
| "f1_execute": 0.6666666865348816, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 40.75, |
| "learning_rate": 3.8e-05, |
| "loss": 3.2996, |
| "macro_f1": 0.222222238779068, |
| "num_tokens": 32410.0, |
| "repeat_count": 0.0, |
| "routers_loss": 7.038887977600098, |
| "skip_count": 0.0, |
| "step": 20, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 24.0, |
| "epoch": 0.12013651877133105, |
| "f1_execute": 0.5641025900840759, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 32.5, |
| "learning_rate": 4.2000000000000004e-05, |
| "loss": 2.7437, |
| "macro_f1": 0.18803420662879944, |
| "num_tokens": 35122.0, |
| "repeat_count": 1.0, |
| "routers_loss": 4.3931450843811035, |
| "skip_count": 2.0, |
| "step": 22, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 0.1310580204778157, |
| "f1_execute": 0.6341463327407837, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 44.0, |
| "learning_rate": 4.6e-05, |
| "loss": 2.9583, |
| "macro_f1": 0.21138212084770203, |
| "num_tokens": 38647.0, |
| "repeat_count": 0.0, |
| "routers_loss": 5.246743202209473, |
| "skip_count": 2.0, |
| "step": 24, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 24.0, |
| "epoch": 0.14197952218430035, |
| "f1_execute": 0.6666666865348816, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 36.0, |
| "learning_rate": 5e-05, |
| "loss": 2.0258, |
| "macro_f1": 0.222222238779068, |
| "num_tokens": 41759.0, |
| "repeat_count": 0.0, |
| "routers_loss": 4.385664463043213, |
| "skip_count": 0.0, |
| "step": 26, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 0.15290102389078497, |
| "f1_execute": 0.6666666865348816, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 22.75, |
| "learning_rate": 5.4e-05, |
| "loss": 1.8932, |
| "macro_f1": 0.222222238779068, |
| "num_tokens": 45255.0, |
| "repeat_count": 1.0, |
| "routers_loss": 2.442974090576172, |
| "skip_count": 2.0, |
| "step": 28, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 0.16382252559726962, |
| "f1_execute": 0.7272726893424988, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 20.5, |
| "learning_rate": 5.800000000000001e-05, |
| "loss": 1.5961, |
| "macro_f1": 0.24242423474788666, |
| "num_tokens": 48765.0, |
| "repeat_count": 0.0, |
| "routers_loss": 1.319467306137085, |
| "skip_count": 3.0, |
| "step": 30, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.17474402730375427, |
| "f1_execute": 0.782608687877655, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 21.875, |
| "learning_rate": 6.2e-05, |
| "loss": 1.7529, |
| "macro_f1": 0.260869562625885, |
| "num_tokens": 51973.0, |
| "repeat_count": 0.0, |
| "routers_loss": 1.2047386169433594, |
| "skip_count": 2.0, |
| "step": 32, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.18566552901023892, |
| "f1_execute": 0.8799999952316284, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 26.875, |
| "learning_rate": 6.6e-05, |
| "loss": 1.4983, |
| "macro_f1": 0.29333335161209106, |
| "num_tokens": 54972.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.8216792345046997, |
| "skip_count": 0.0, |
| "step": 34, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 0.19658703071672354, |
| "f1_execute": 0.9230769276618958, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 20.75, |
| "learning_rate": 7.000000000000001e-05, |
| "loss": 1.2751, |
| "macro_f1": 0.3076923191547394, |
| "num_tokens": 58134.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.6534898281097412, |
| "skip_count": 0.0, |
| "step": 36, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 0.2075085324232082, |
| "f1_execute": 0.8799999952316284, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 17.75, |
| "learning_rate": 7.4e-05, |
| "loss": 0.9561, |
| "macro_f1": 0.29333335161209106, |
| "num_tokens": 61291.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.6772168278694153, |
| "skip_count": 2.0, |
| "step": 38, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 0.21843003412969283, |
| "f1_execute": 0.923076868057251, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 15.875, |
| "learning_rate": 7.8e-05, |
| "loss": 0.6809, |
| "macro_f1": 0.307692289352417, |
| "num_tokens": 64406.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.7885609865188599, |
| "skip_count": 1.0, |
| "step": 40, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 0.22935153583617748, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 17.0, |
| "learning_rate": 8.2e-05, |
| "loss": 0.587, |
| "macro_f1": 0.3205128312110901, |
| "num_tokens": 67402.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.31721553206443787, |
| "skip_count": 0.0, |
| "step": 42, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.2402730375426621, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 12.625, |
| "learning_rate": 8.599999999999999e-05, |
| "loss": 0.4996, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 70935.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.13094936311244965, |
| "skip_count": 0.0, |
| "step": 44, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 0.25119453924914675, |
| "f1_execute": 0.8799999952316284, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 12.5625, |
| "learning_rate": 8.999999999999999e-05, |
| "loss": 0.4226, |
| "macro_f1": 0.29333335161209106, |
| "num_tokens": 73716.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.48597365617752075, |
| "skip_count": 3.0, |
| "step": 46, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 0.2621160409556314, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 9.1875, |
| "learning_rate": 9.400000000000001e-05, |
| "loss": 0.2499, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 76662.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.7850716710090637, |
| "skip_count": 1.0, |
| "step": 48, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.27303754266211605, |
| "f1_execute": 0.943396270275116, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 9.5625, |
| "learning_rate": 9.800000000000001e-05, |
| "loss": 0.3029, |
| "macro_f1": 0.3144654333591461, |
| "num_tokens": 80080.0, |
| "repeat_count": 2.0, |
| "routers_loss": 1.4728330373764038, |
| "skip_count": 1.0, |
| "step": 50, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.2839590443686007, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.0625, |
| "learning_rate": 0.000102, |
| "loss": 0.2549, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 82942.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.16784702241420746, |
| "skip_count": 2.0, |
| "step": 52, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 0.29488054607508535, |
| "f1_execute": 0.8571428060531616, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.5625, |
| "learning_rate": 0.000106, |
| "loss": 0.2782, |
| "macro_f1": 0.2857142686843872, |
| "num_tokens": 85928.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.25518977642059326, |
| "skip_count": 4.0, |
| "step": 54, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 0.30580204778156994, |
| "f1_execute": 0.923076868057251, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.75, |
| "learning_rate": 0.00011, |
| "loss": 0.2309, |
| "macro_f1": 0.307692289352417, |
| "num_tokens": 88804.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.21613653004169464, |
| "skip_count": 3.0, |
| "step": 56, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 0.3167235494880546, |
| "f1_execute": 0.8571429252624512, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.625, |
| "learning_rate": 0.000114, |
| "loss": 0.1319, |
| "macro_f1": 0.285714328289032, |
| "num_tokens": 91674.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.4971294403076172, |
| "skip_count": 5.0, |
| "step": 58, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.32764505119453924, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.625, |
| "learning_rate": 0.000118, |
| "loss": 0.1637, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 94858.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.01838197372853756, |
| "skip_count": 0.0, |
| "step": 60, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 0.3385665529010239, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.859375, |
| "learning_rate": 0.000122, |
| "loss": 0.1888, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 97538.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.5383598804473877, |
| "skip_count": 1.0, |
| "step": 62, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 31.0, |
| "epoch": 0.34948805460750854, |
| "f1_execute": 0.8571428060531616, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.5, |
| "learning_rate": 0.000126, |
| "loss": 0.2176, |
| "macro_f1": 0.2857142686843872, |
| "num_tokens": 101249.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.2093856781721115, |
| "skip_count": 1.0, |
| "step": 64, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.3604095563139932, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.625, |
| "learning_rate": 0.00013000000000000002, |
| "loss": 0.1568, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 104398.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.015723152086138725, |
| "skip_count": 0.0, |
| "step": 66, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.37133105802047783, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.34375, |
| "learning_rate": 0.000134, |
| "loss": 0.2764, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 107538.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.019146224483847618, |
| "skip_count": 0.0, |
| "step": 68, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.3822525597269625, |
| "f1_execute": 0.943396270275116, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.25, |
| "learning_rate": 0.00013800000000000002, |
| "loss": 0.2035, |
| "macro_f1": 0.3144654333591461, |
| "num_tokens": 110689.0, |
| "repeat_count": 3.0, |
| "routers_loss": 0.6408394575119019, |
| "skip_count": 0.0, |
| "step": 70, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 0.3931740614334471, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.6875, |
| "learning_rate": 0.00014199999999999998, |
| "loss": 0.1986, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 114205.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.04342689737677574, |
| "skip_count": 0.0, |
| "step": 72, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 0.4040955631399317, |
| "f1_execute": 0.923076868057251, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.0625, |
| "learning_rate": 0.000146, |
| "loss": 0.1412, |
| "macro_f1": 0.307692289352417, |
| "num_tokens": 117140.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.12777170538902283, |
| "skip_count": 1.0, |
| "step": 74, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 0.4150170648464164, |
| "f1_execute": 0.8571428060531616, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.5, |
| "learning_rate": 0.00015, |
| "loss": 0.1273, |
| "macro_f1": 0.2857142686843872, |
| "num_tokens": 120355.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.2570268511772156, |
| "skip_count": 5.0, |
| "step": 76, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.425938566552901, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.03125, |
| "learning_rate": 0.000154, |
| "loss": 0.1169, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 123542.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.019178830087184906, |
| "skip_count": 0.0, |
| "step": 78, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 0.43686006825938567, |
| "f1_execute": 0.9019607901573181, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.5, |
| "learning_rate": 0.000158, |
| "loss": 0.1702, |
| "macro_f1": 0.3006536066532135, |
| "num_tokens": 126444.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.40678197145462036, |
| "skip_count": 4.0, |
| "step": 80, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.4477815699658703, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.40625, |
| "learning_rate": 0.000162, |
| "loss": 0.207, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 129208.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.016020173206925392, |
| "skip_count": 0.0, |
| "step": 82, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.45870307167235497, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.0, |
| "learning_rate": 0.00016600000000000002, |
| "loss": 0.1469, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 132692.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.015191584825515747, |
| "skip_count": 0.0, |
| "step": 84, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 0.4696245733788396, |
| "f1_execute": 0.923076868057251, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.9375, |
| "learning_rate": 0.00017, |
| "loss": 0.1883, |
| "macro_f1": 0.307692289352417, |
| "num_tokens": 135433.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.29757800698280334, |
| "skip_count": 2.0, |
| "step": 86, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.1111111119389534, |
| "avg_layers": 27.0, |
| "epoch": 0.4805460750853242, |
| "f1_execute": 0.7142857313156128, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.1818181872367859, |
| "grad_norm": 4.21875, |
| "learning_rate": 0.000174, |
| "loss": 0.2656, |
| "macro_f1": 0.29870131611824036, |
| "num_tokens": 139019.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.5406635403633118, |
| "skip_count": 9.0, |
| "step": 88, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 0.49146757679180886, |
| "f1_execute": 0.8571429252624512, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.46875, |
| "learning_rate": 0.000178, |
| "loss": 0.2149, |
| "macro_f1": 0.285714328289032, |
| "num_tokens": 142156.0, |
| "repeat_count": 3.0, |
| "routers_loss": 0.9084331393241882, |
| "skip_count": 3.0, |
| "step": 90, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 25.0, |
| "epoch": 0.5023890784982935, |
| "f1_execute": 0.8979592323303223, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.3333333432674408, |
| "grad_norm": 6.15625, |
| "learning_rate": 0.000182, |
| "loss": 0.1461, |
| "macro_f1": 0.4104308784008026, |
| "num_tokens": 144866.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.298293799161911, |
| "skip_count": 3.0, |
| "step": 92, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.5133105802047782, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.59375, |
| "learning_rate": 0.000186, |
| "loss": 0.1432, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 148029.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.13971005380153656, |
| "skip_count": 1.0, |
| "step": 94, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.5242320819112628, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.53125, |
| "learning_rate": 0.00019, |
| "loss": 0.1566, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 151076.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.2203323394060135, |
| "skip_count": 2.0, |
| "step": 96, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.5351535836177475, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 10.25, |
| "learning_rate": 0.000194, |
| "loss": 0.3221, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 153825.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.22957128286361694, |
| "skip_count": 2.0, |
| "step": 98, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 0.5460750853242321, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.46875, |
| "learning_rate": 0.00019800000000000002, |
| "loss": 0.1445, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 157200.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0985352173447609, |
| "skip_count": 0.0, |
| "step": 100, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.5569965870307167, |
| "f1_execute": 0.943396270275116, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.125, |
| "learning_rate": 0.000202, |
| "loss": 0.2346, |
| "macro_f1": 0.3144654333591461, |
| "num_tokens": 161171.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.5728805065155029, |
| "skip_count": 2.0, |
| "step": 102, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 0.5679180887372014, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 4.65625, |
| "learning_rate": 0.000206, |
| "loss": 0.1532, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 165319.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.08763546496629715, |
| "skip_count": 2.0, |
| "step": 104, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.578839590443686, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.375, |
| "learning_rate": 0.00021, |
| "loss": 0.1183, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 168259.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.11700262129306793, |
| "skip_count": 1.0, |
| "step": 106, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.5897610921501707, |
| "f1_execute": 0.943396270275116, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.1875, |
| "learning_rate": 0.000214, |
| "loss": 0.1856, |
| "macro_f1": 0.3144654333591461, |
| "num_tokens": 171640.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.2897156774997711, |
| "skip_count": 2.0, |
| "step": 108, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 0.6006825938566553, |
| "f1_execute": 0.9019607901573181, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.84375, |
| "learning_rate": 0.000218, |
| "loss": 0.1379, |
| "macro_f1": 0.3006536066532135, |
| "num_tokens": 174452.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.20764203369617462, |
| "skip_count": 4.0, |
| "step": 110, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.6116040955631399, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.9375, |
| "learning_rate": 0.000222, |
| "loss": 0.14, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 177034.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.07773401588201523, |
| "skip_count": 0.0, |
| "step": 112, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 0.6225255972696245, |
| "f1_execute": 0.8571428656578064, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.375, |
| "learning_rate": 0.00022600000000000002, |
| "loss": 0.1327, |
| "macro_f1": 0.2857142984867096, |
| "num_tokens": 180310.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.3696478605270386, |
| "skip_count": 2.0, |
| "step": 114, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.6334470989761092, |
| "f1_execute": 0.8333333730697632, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.984375, |
| "learning_rate": 0.00023, |
| "loss": 0.155, |
| "macro_f1": 0.2777777910232544, |
| "num_tokens": 182835.0, |
| "repeat_count": 3.0, |
| "routers_loss": 0.5024136304855347, |
| "skip_count": 5.0, |
| "step": 116, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.6443686006825938, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.71875, |
| "learning_rate": 0.00023400000000000002, |
| "loss": 0.1566, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 186508.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.02631981112062931, |
| "skip_count": 0.0, |
| "step": 118, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.6552901023890785, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.03125, |
| "learning_rate": 0.00023799999999999998, |
| "loss": 0.1503, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 190380.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.036612559109926224, |
| "skip_count": 0.0, |
| "step": 120, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 0.6662116040955631, |
| "f1_execute": 0.9230769276618958, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.28125, |
| "learning_rate": 0.000242, |
| "loss": 0.181, |
| "macro_f1": 0.3076923191547394, |
| "num_tokens": 193279.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.37753066420555115, |
| "skip_count": 1.0, |
| "step": 122, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 0.6771331058020478, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.75, |
| "learning_rate": 0.000246, |
| "loss": 0.1187, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 196711.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.08419940620660782, |
| "skip_count": 1.0, |
| "step": 124, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 0.6880546075085324, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 4.0, |
| "learning_rate": 0.00025, |
| "loss": 0.1184, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 199715.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.043020736426115036, |
| "skip_count": 2.0, |
| "step": 126, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 0.6989761092150171, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.9375, |
| "learning_rate": 0.000254, |
| "loss": 0.1421, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 204217.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0802314504981041, |
| "skip_count": 1.0, |
| "step": 128, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.7098976109215017, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.0, |
| "learning_rate": 0.00025800000000000004, |
| "loss": 0.1719, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 206777.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.09076520055532455, |
| "skip_count": 1.0, |
| "step": 130, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 0.7208191126279864, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.9375, |
| "learning_rate": 0.000262, |
| "loss": 0.1423, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 210838.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.024340573698282242, |
| "skip_count": 0.0, |
| "step": 132, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.731740614334471, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.875, |
| "learning_rate": 0.000266, |
| "loss": 0.1, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 213498.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.016322199255228043, |
| "skip_count": 0.0, |
| "step": 134, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.7426621160409557, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.34375, |
| "learning_rate": 0.00027, |
| "loss": 0.1408, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 216998.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.042806077748537064, |
| "skip_count": 1.0, |
| "step": 136, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.7535836177474403, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.6875, |
| "learning_rate": 0.00027400000000000005, |
| "loss": 0.1012, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 219952.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.12166574597358704, |
| "skip_count": 2.0, |
| "step": 138, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 0.764505119453925, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.59375, |
| "learning_rate": 0.00027800000000000004, |
| "loss": 0.1576, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 223326.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.12389889359474182, |
| "skip_count": 1.0, |
| "step": 140, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 0.7754266211604095, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.46875, |
| "learning_rate": 0.00028199999999999997, |
| "loss": 0.1554, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 226179.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.1315135806798935, |
| "skip_count": 2.0, |
| "step": 142, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.7863481228668942, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.625, |
| "learning_rate": 0.00028599999999999996, |
| "loss": 0.1188, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 228782.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.08095238357782364, |
| "skip_count": 1.0, |
| "step": 144, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.7972696245733788, |
| "f1_execute": 0.9230769276618958, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 9.5, |
| "learning_rate": 0.00029, |
| "loss": 0.1616, |
| "macro_f1": 0.3076923191547394, |
| "num_tokens": 231771.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.13997994363307953, |
| "skip_count": 4.0, |
| "step": 146, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.8081911262798634, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.0, |
| "learning_rate": 0.000294, |
| "loss": 0.1868, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 234517.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.03245344012975693, |
| "skip_count": 0.0, |
| "step": 148, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 0.8191126279863481, |
| "f1_execute": 0.9019607901573181, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.375, |
| "learning_rate": 0.000298, |
| "loss": 0.148, |
| "macro_f1": 0.3006536066532135, |
| "num_tokens": 237324.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.36887046694755554, |
| "skip_count": 2.0, |
| "step": 150, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.8300341296928327, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.734375, |
| "learning_rate": 0.000302, |
| "loss": 0.1759, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 240657.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.1363309770822525, |
| "skip_count": 0.0, |
| "step": 152, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.8409556313993174, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.65625, |
| "learning_rate": 0.000306, |
| "loss": 0.2043, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 243741.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.024881718680262566, |
| "skip_count": 0.0, |
| "step": 154, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 27.0, |
| "epoch": 0.851877133105802, |
| "f1_execute": 0.8979592323303223, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.4000000059604645, |
| "grad_norm": 4.5625, |
| "learning_rate": 0.00031, |
| "loss": 0.1777, |
| "macro_f1": 0.4326530694961548, |
| "num_tokens": 246879.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.25227662920951843, |
| "skip_count": 3.0, |
| "step": 156, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 27.0, |
| "epoch": 0.8627986348122867, |
| "f1_execute": 0.9200000166893005, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 5.28125, |
| "learning_rate": 0.000314, |
| "loss": 0.1641, |
| "macro_f1": 0.47333335876464844, |
| "num_tokens": 249880.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.3088915944099426, |
| "skip_count": 3.0, |
| "step": 158, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 25.0, |
| "epoch": 0.8737201365187713, |
| "f1_execute": 0.9200000166893005, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.3333333432674408, |
| "grad_norm": 6.59375, |
| "learning_rate": 0.00031800000000000003, |
| "loss": 0.1687, |
| "macro_f1": 0.41777777671813965, |
| "num_tokens": 252725.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.11272747814655304, |
| "skip_count": 3.0, |
| "step": 160, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 0.884641638225256, |
| "f1_execute": 0.943396270275116, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.90625, |
| "learning_rate": 0.000322, |
| "loss": 0.1408, |
| "macro_f1": 0.3144654333591461, |
| "num_tokens": 255951.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.05064187943935394, |
| "skip_count": 0.0, |
| "step": 162, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.8955631399317406, |
| "f1_execute": 0.9230769276618958, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.65625, |
| "learning_rate": 0.000326, |
| "loss": 0.1509, |
| "macro_f1": 0.3076923191547394, |
| "num_tokens": 259469.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.21262036263942719, |
| "skip_count": 2.0, |
| "step": 164, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 26.0, |
| "epoch": 0.9064846416382253, |
| "f1_execute": 0.9200000166893005, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.4000000059604645, |
| "grad_norm": 5.25, |
| "learning_rate": 0.00033, |
| "loss": 0.1578, |
| "macro_f1": 0.4400000274181366, |
| "num_tokens": 262272.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.1725386530160904, |
| "skip_count": 3.0, |
| "step": 166, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 0.9174061433447099, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.4375, |
| "learning_rate": 0.00033400000000000004, |
| "loss": 0.1471, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 266415.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.02629087306559086, |
| "skip_count": 0.0, |
| "step": 168, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 0.9283276450511946, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.296875, |
| "learning_rate": 0.00033800000000000003, |
| "loss": 0.1185, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 269700.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.05510875955224037, |
| "skip_count": 1.0, |
| "step": 170, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 0.9392491467576792, |
| "f1_execute": 0.9019607901573181, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.75, |
| "learning_rate": 0.000342, |
| "loss": 0.1637, |
| "macro_f1": 0.3006536066532135, |
| "num_tokens": 272587.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.27733829617500305, |
| "skip_count": 3.0, |
| "step": 172, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 0.9501706484641638, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.4375, |
| "learning_rate": 0.000346, |
| "loss": 0.2034, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 277005.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.14457301795482635, |
| "skip_count": 2.0, |
| "step": 174, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 0.9610921501706484, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 9.125, |
| "learning_rate": 0.00035, |
| "loss": 0.154, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 279607.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.07571296393871307, |
| "skip_count": 2.0, |
| "step": 176, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 0.9720136518771331, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.84375, |
| "learning_rate": 0.000354, |
| "loss": 0.1894, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 282547.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.5549371838569641, |
| "skip_count": 0.0, |
| "step": 178, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.5, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 0.9829351535836177, |
| "f1_execute": 0.9411765336990356, |
| "f1_repeat": 0.6666666865348816, |
| "f1_skip": 0.0, |
| "grad_norm": 4.9375, |
| "learning_rate": 0.000358, |
| "loss": 0.1226, |
| "macro_f1": 0.5359477400779724, |
| "num_tokens": 286081.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.2509016990661621, |
| "skip_count": 2.0, |
| "step": 180, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 0.9938566552901024, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.9375, |
| "learning_rate": 0.000362, |
| "loss": 0.1795, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 289224.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.017457736656069756, |
| "skip_count": 0.0, |
| "step": 182, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.0, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.390625, |
| "learning_rate": 0.000366, |
| "loss": 0.1471, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 290916.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.05112108215689659, |
| "skip_count": 0.0, |
| "step": 184, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.0109215017064845, |
| "f1_execute": 0.9230769276618958, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.6875, |
| "learning_rate": 0.00037, |
| "loss": 0.1459, |
| "macro_f1": 0.3076923191547394, |
| "num_tokens": 294182.0, |
| "repeat_count": 3.0, |
| "routers_loss": 0.5592358708381653, |
| "skip_count": 1.0, |
| "step": 186, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.0218430034129693, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.890625, |
| "learning_rate": 0.000374, |
| "loss": 0.1446, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 296702.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.006012737285345793, |
| "skip_count": 0.0, |
| "step": 188, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.0327645051194538, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.96875, |
| "learning_rate": 0.000378, |
| "loss": 0.1394, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 300348.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.06094537675380707, |
| "skip_count": 2.0, |
| "step": 190, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.0436860068259386, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.625, |
| "learning_rate": 0.000382, |
| "loss": 0.0995, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 303466.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.08475696295499802, |
| "skip_count": 1.0, |
| "step": 192, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.0546075085324231, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.1875, |
| "learning_rate": 0.000386, |
| "loss": 0.1749, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 306160.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.010187637060880661, |
| "skip_count": 0.0, |
| "step": 194, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.065529010238908, |
| "f1_execute": 0.9230769276618958, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.28125, |
| "learning_rate": 0.00039000000000000005, |
| "loss": 0.1692, |
| "macro_f1": 0.3076923191547394, |
| "num_tokens": 309453.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.20142780244350433, |
| "skip_count": 1.0, |
| "step": 196, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.0764505119453924, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.90625, |
| "learning_rate": 0.00039400000000000004, |
| "loss": 0.1283, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 312138.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.015577984042465687, |
| "skip_count": 0.0, |
| "step": 198, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 1.0873720136518772, |
| "f1_execute": 0.9411764740943909, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 3.46875, |
| "learning_rate": 0.000398, |
| "loss": 0.1061, |
| "macro_f1": 0.4803921580314636, |
| "num_tokens": 315833.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.1465342938899994, |
| "skip_count": 2.0, |
| "step": 200, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 1.0982935153583617, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 10.5625, |
| "learning_rate": 0.000402, |
| "loss": 0.1879, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 318690.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.09964372962713242, |
| "skip_count": 0.0, |
| "step": 202, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 30.0, |
| "epoch": 1.1092150170648465, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.25, |
| "learning_rate": 0.00040600000000000006, |
| "loss": 0.1226, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 322294.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.030282732099294662, |
| "skip_count": 0.0, |
| "step": 204, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 1.120136518771331, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.125, |
| "learning_rate": 0.00041, |
| "loss": 0.1582, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 325029.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.24788229167461395, |
| "skip_count": 1.0, |
| "step": 206, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 25.0, |
| "epoch": 1.1310580204778158, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 5.9375, |
| "learning_rate": 0.000414, |
| "loss": 0.2048, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 328178.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.031264692544937134, |
| "skip_count": 1.0, |
| "step": 208, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 25.0, |
| "epoch": 1.1419795221843003, |
| "f1_execute": 0.9166666269302368, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5714285373687744, |
| "grad_norm": 6.8125, |
| "learning_rate": 0.00041799999999999997, |
| "loss": 0.1756, |
| "macro_f1": 0.4960317313671112, |
| "num_tokens": 331351.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.343823105096817, |
| "skip_count": 4.0, |
| "step": 210, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.1529010238907849, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.84375, |
| "learning_rate": 0.000422, |
| "loss": 0.1246, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 335297.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.014860679395496845, |
| "skip_count": 0.0, |
| "step": 212, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.1638225255972696, |
| "f1_execute": 0.9019607901573181, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.46875, |
| "learning_rate": 0.000426, |
| "loss": 0.1537, |
| "macro_f1": 0.3006536066532135, |
| "num_tokens": 338427.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.33231568336486816, |
| "skip_count": 3.0, |
| "step": 214, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.1747440273037544, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.59375, |
| "learning_rate": 0.00043, |
| "loss": 0.1546, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 341158.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.007448212709277868, |
| "skip_count": 0.0, |
| "step": 216, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 1.185665529010239, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.28125, |
| "learning_rate": 0.00043400000000000003, |
| "loss": 0.1468, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 344329.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.02311822399497032, |
| "skip_count": 0.0, |
| "step": 218, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.1965870307167235, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.15625, |
| "learning_rate": 0.000438, |
| "loss": 0.1307, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 348948.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.02867077849805355, |
| "skip_count": 1.0, |
| "step": 220, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 1.2075085324232082, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 4.96875, |
| "learning_rate": 0.000442, |
| "loss": 0.2046, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 351741.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.03160649910569191, |
| "skip_count": 2.0, |
| "step": 222, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.2184300341296928, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.5, |
| "learning_rate": 0.000446, |
| "loss": 0.2074, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 354852.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.1611160784959793, |
| "skip_count": 0.0, |
| "step": 224, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.5, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 29.0, |
| "epoch": 1.2293515358361775, |
| "f1_execute": 0.8695651888847351, |
| "f1_repeat": 0.4000000059604645, |
| "f1_skip": 0.4000000059604645, |
| "grad_norm": 3.328125, |
| "learning_rate": 0.00045000000000000004, |
| "loss": 0.118, |
| "macro_f1": 0.5565217733383179, |
| "num_tokens": 357431.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.7632720470428467, |
| "skip_count": 3.0, |
| "step": 226, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.240273037542662, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.59375, |
| "learning_rate": 0.00045400000000000003, |
| "loss": 0.0965, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 360192.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.08349918574094772, |
| "skip_count": 1.0, |
| "step": 228, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 1.2511945392491468, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 9.9375, |
| "learning_rate": 0.000458, |
| "loss": 0.1714, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 363209.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.06626693904399872, |
| "skip_count": 2.0, |
| "step": 230, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.2621160409556313, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.25, |
| "learning_rate": 0.000462, |
| "loss": 0.1859, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 368262.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.03743857145309448, |
| "skip_count": 0.0, |
| "step": 232, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 1.273037542662116, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.34375, |
| "learning_rate": 0.00046600000000000005, |
| "loss": 0.2281, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 370737.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.12340149283409119, |
| "skip_count": 0.0, |
| "step": 234, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 1.2839590443686006, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.8125, |
| "learning_rate": 0.00047, |
| "loss": 0.1535, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 373272.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.04501926526427269, |
| "skip_count": 0.0, |
| "step": 236, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 1.2948805460750854, |
| "f1_execute": 0.9230769276618958, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.625, |
| "learning_rate": 0.000474, |
| "loss": 0.1701, |
| "macro_f1": 0.3076923191547394, |
| "num_tokens": 376924.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.3543643057346344, |
| "skip_count": 1.0, |
| "step": 238, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.25, |
| "avg_layers": 27.0, |
| "epoch": 1.30580204778157, |
| "f1_execute": 0.9200000166893005, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.4000000059604645, |
| "grad_norm": 5.78125, |
| "learning_rate": 0.00047799999999999996, |
| "loss": 0.1553, |
| "macro_f1": 0.4400000274181366, |
| "num_tokens": 380034.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.1332877278327942, |
| "skip_count": 4.0, |
| "step": 240, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.3167235494880547, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.125, |
| "learning_rate": 0.000482, |
| "loss": 0.0874, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 382846.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.013933669775724411, |
| "skip_count": 0.0, |
| "step": 242, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.3276450511945392, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.765625, |
| "learning_rate": 0.000486, |
| "loss": 0.1505, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 385916.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.11566327512264252, |
| "skip_count": 1.0, |
| "step": 244, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.3385665529010238, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.9375, |
| "learning_rate": 0.00049, |
| "loss": 0.1634, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 388768.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.015394577756524086, |
| "skip_count": 0.0, |
| "step": 246, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 1.3494880546075085, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.78125, |
| "learning_rate": 0.000494, |
| "loss": 0.1493, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 391699.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.05529753863811493, |
| "skip_count": 0.0, |
| "step": 248, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.3604095563139933, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 9.75, |
| "learning_rate": 0.000498, |
| "loss": 0.2545, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 395380.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.15498189628124237, |
| "skip_count": 1.0, |
| "step": 250, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.3713310580204778, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.8125, |
| "learning_rate": 0.0005020000000000001, |
| "loss": 0.1998, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 398414.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.053408559411764145, |
| "skip_count": 2.0, |
| "step": 252, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 1.3822525597269624, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.4375, |
| "learning_rate": 0.000506, |
| "loss": 0.1761, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 401690.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.15143637359142303, |
| "skip_count": 1.0, |
| "step": 254, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.3931740614334471, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.796875, |
| "learning_rate": 0.00051, |
| "loss": 0.1638, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 404533.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.036931805312633514, |
| "skip_count": 1.0, |
| "step": 256, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 1.4040955631399317, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 7.21875, |
| "learning_rate": 0.000514, |
| "loss": 0.1765, |
| "macro_f1": 0.5427350401878357, |
| "num_tokens": 408175.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.16898785531520844, |
| "skip_count": 2.0, |
| "step": 258, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 25.0, |
| "epoch": 1.4150170648464164, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 3.875, |
| "learning_rate": 0.000518, |
| "loss": 0.2172, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 411160.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.05883602425456047, |
| "skip_count": 1.0, |
| "step": 260, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.425938566552901, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.453125, |
| "learning_rate": 0.000522, |
| "loss": 0.1121, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 414391.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.14810606837272644, |
| "skip_count": 2.0, |
| "step": 262, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.4368600682593857, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.46875, |
| "learning_rate": 0.000526, |
| "loss": 0.1772, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 417763.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.20452100038528442, |
| "skip_count": 0.0, |
| "step": 264, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 26.0, |
| "epoch": 1.4477815699658703, |
| "f1_execute": 0.8979591727256775, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.4000000059604645, |
| "grad_norm": 3.5, |
| "learning_rate": 0.0005300000000000001, |
| "loss": 0.1446, |
| "macro_f1": 0.4326530694961548, |
| "num_tokens": 421881.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.32300108671188354, |
| "skip_count": 3.0, |
| "step": 266, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.20000000298023224, |
| "avg_layers": 27.0, |
| "epoch": 1.458703071672355, |
| "f1_execute": 0.8260869383811951, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.2857142984867096, |
| "grad_norm": 3.96875, |
| "learning_rate": 0.0005340000000000001, |
| "loss": 0.1377, |
| "macro_f1": 0.3706004321575165, |
| "num_tokens": 424938.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.5530142784118652, |
| "skip_count": 5.0, |
| "step": 268, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.4696245733788396, |
| "f1_execute": 0.923076868057251, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 9.5625, |
| "learning_rate": 0.0005380000000000001, |
| "loss": 0.1457, |
| "macro_f1": 0.307692289352417, |
| "num_tokens": 427555.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.10682675242424011, |
| "skip_count": 3.0, |
| "step": 270, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.480546075085324, |
| "f1_execute": 0.943396270275116, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.90625, |
| "learning_rate": 0.0005420000000000001, |
| "loss": 0.174, |
| "macro_f1": 0.3144654333591461, |
| "num_tokens": 430168.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.9753395318984985, |
| "skip_count": 2.0, |
| "step": 272, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.4914675767918089, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.75, |
| "learning_rate": 0.000546, |
| "loss": 0.1441, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 433358.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.021224403753876686, |
| "skip_count": 0.0, |
| "step": 274, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.5023890784982936, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.78125, |
| "learning_rate": 0.00055, |
| "loss": 0.1624, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 436460.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.08185791224241257, |
| "skip_count": 2.0, |
| "step": 276, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 1.5133105802047782, |
| "f1_execute": 0.943396270275116, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.40625, |
| "learning_rate": 0.000554, |
| "loss": 0.1677, |
| "macro_f1": 0.3144654333591461, |
| "num_tokens": 439531.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.037240445613861084, |
| "skip_count": 0.0, |
| "step": 278, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.5242320819112627, |
| "f1_execute": 0.9019607901573181, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.765625, |
| "learning_rate": 0.000558, |
| "loss": 0.2688, |
| "macro_f1": 0.3006536066532135, |
| "num_tokens": 442521.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.3406132459640503, |
| "skip_count": 3.0, |
| "step": 280, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.5351535836177475, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.5625, |
| "learning_rate": 0.0005620000000000001, |
| "loss": 0.0875, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 444942.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.006758399773389101, |
| "skip_count": 0.0, |
| "step": 282, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.5460750853242322, |
| "f1_execute": 0.943396270275116, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.90625, |
| "learning_rate": 0.000566, |
| "loss": 0.1597, |
| "macro_f1": 0.3144654333591461, |
| "num_tokens": 448193.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.06801790744066238, |
| "skip_count": 0.0, |
| "step": 284, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.20000000298023224, |
| "avg_layers": 27.0, |
| "epoch": 1.5569965870307167, |
| "f1_execute": 0.8510637879371643, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.3333333134651184, |
| "grad_norm": 4.78125, |
| "learning_rate": 0.00057, |
| "loss": 0.2027, |
| "macro_f1": 0.39479905366897583, |
| "num_tokens": 451293.0, |
| "repeat_count": 3.0, |
| "routers_loss": 0.23832914233207703, |
| "skip_count": 5.0, |
| "step": 286, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.5679180887372013, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.40625, |
| "learning_rate": 0.000574, |
| "loss": 0.1361, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 454069.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.14267782866954803, |
| "skip_count": 0.0, |
| "step": 288, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.578839590443686, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.65625, |
| "learning_rate": 0.000578, |
| "loss": 0.1921, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 457308.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.3219856917858124, |
| "skip_count": 2.0, |
| "step": 290, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.5897610921501708, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.90625, |
| "learning_rate": 0.0005819999999999999, |
| "loss": 0.2214, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 460138.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.4478992521762848, |
| "skip_count": 1.0, |
| "step": 292, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.6006825938566553, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.9375, |
| "learning_rate": 0.0005859999999999999, |
| "loss": 0.2102, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 464029.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.019972749054431915, |
| "skip_count": 0.0, |
| "step": 294, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.6116040955631399, |
| "f1_execute": 0.9230769276618958, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.5, |
| "learning_rate": 0.00059, |
| "loss": 0.1164, |
| "macro_f1": 0.3076923191547394, |
| "num_tokens": 467500.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.14752870798110962, |
| "skip_count": 3.0, |
| "step": 296, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.6225255972696244, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.75, |
| "learning_rate": 0.000594, |
| "loss": 0.1434, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 470734.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.30419600009918213, |
| "skip_count": 1.0, |
| "step": 298, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 1.6334470989761092, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.375, |
| "learning_rate": 0.000598, |
| "loss": 0.2077, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 474514.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.06921514868736267, |
| "skip_count": 2.0, |
| "step": 300, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.644368600682594, |
| "f1_execute": 0.9230769276618958, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.125, |
| "learning_rate": 0.000602, |
| "loss": 0.1566, |
| "macro_f1": 0.3076923191547394, |
| "num_tokens": 477393.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.2468976378440857, |
| "skip_count": 2.0, |
| "step": 302, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.6552901023890785, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.375, |
| "learning_rate": 0.000606, |
| "loss": 0.1649, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 480381.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.020447812974452972, |
| "skip_count": 0.0, |
| "step": 304, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.666211604095563, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.5, |
| "learning_rate": 0.00061, |
| "loss": 0.1423, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 483502.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.05023586004972458, |
| "skip_count": 1.0, |
| "step": 306, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 1.6771331058020478, |
| "f1_execute": 0.943396270275116, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.0, |
| "learning_rate": 0.000614, |
| "loss": 0.2042, |
| "macro_f1": 0.3144654333591461, |
| "num_tokens": 488006.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.049936871975660324, |
| "skip_count": 0.0, |
| "step": 308, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.6880546075085325, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.46875, |
| "learning_rate": 0.0006180000000000001, |
| "loss": 0.2121, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 491611.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.20010031759738922, |
| "skip_count": 0.0, |
| "step": 310, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.698976109215017, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.09375, |
| "learning_rate": 0.000622, |
| "loss": 0.2415, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 494903.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.01630268059670925, |
| "skip_count": 0.0, |
| "step": 312, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.7098976109215016, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.90625, |
| "learning_rate": 0.000626, |
| "loss": 0.2042, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 497949.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.2674679160118103, |
| "skip_count": 1.0, |
| "step": 314, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 1.0, |
| "acc_skip": 0.5, |
| "avg_layers": 28.0, |
| "epoch": 1.7208191126279864, |
| "f1_execute": 0.9803921580314636, |
| "f1_repeat": 1.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 2.5, |
| "learning_rate": 0.00063, |
| "loss": 0.1844, |
| "macro_f1": 0.8823530077934265, |
| "num_tokens": 501082.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.1621737778186798, |
| "skip_count": 2.0, |
| "step": 316, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.5, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.7317406143344711, |
| "f1_execute": 0.8979592323303223, |
| "f1_repeat": 0.6666666865348816, |
| "f1_skip": 0.0, |
| "grad_norm": 9.125, |
| "learning_rate": 0.000634, |
| "loss": 0.1708, |
| "macro_f1": 0.5215420126914978, |
| "num_tokens": 504131.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.6877225041389465, |
| "skip_count": 2.0, |
| "step": 318, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 24.0, |
| "epoch": 1.7426621160409557, |
| "f1_execute": 0.8799999952316284, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.6875, |
| "learning_rate": 0.000638, |
| "loss": 0.1874, |
| "macro_f1": 0.29333335161209106, |
| "num_tokens": 507012.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.14521881937980652, |
| "skip_count": 2.0, |
| "step": 320, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 23.0, |
| "epoch": 1.7535836177474402, |
| "f1_execute": 0.8936170339584351, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.444444477558136, |
| "grad_norm": 4.46875, |
| "learning_rate": 0.000642, |
| "loss": 0.1489, |
| "macro_f1": 0.44602054357528687, |
| "num_tokens": 509950.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.15650968253612518, |
| "skip_count": 4.0, |
| "step": 322, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 1.764505119453925, |
| "f1_execute": 0.8333333730697632, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.96875, |
| "learning_rate": 0.000646, |
| "loss": 0.163, |
| "macro_f1": 0.2777777910232544, |
| "num_tokens": 512900.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.3924711048603058, |
| "skip_count": 3.0, |
| "step": 324, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 1.7754266211604095, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 2.546875, |
| "learning_rate": 0.0006500000000000001, |
| "loss": 0.1452, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 516233.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.038907092064619064, |
| "skip_count": 2.0, |
| "step": 326, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.7863481228668943, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.796875, |
| "learning_rate": 0.0006540000000000001, |
| "loss": 0.1641, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 519636.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0022514634765684605, |
| "skip_count": 0.0, |
| "step": 328, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 28.0, |
| "epoch": 1.7972696245733788, |
| "f1_execute": 0.9166666865348816, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 7.03125, |
| "learning_rate": 0.0006580000000000001, |
| "loss": 0.2761, |
| "macro_f1": 0.4722222685813904, |
| "num_tokens": 522992.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.4415050148963928, |
| "skip_count": 2.0, |
| "step": 330, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 1.8081911262798633, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 8.4375, |
| "learning_rate": 0.000662, |
| "loss": 0.1657, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 526843.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.06788615882396698, |
| "skip_count": 1.0, |
| "step": 332, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 1.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 1.819112627986348, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 1.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.78125, |
| "learning_rate": 0.000666, |
| "loss": 0.1996, |
| "macro_f1": 0.6603773832321167, |
| "num_tokens": 530177.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.06985973566770554, |
| "skip_count": 1.0, |
| "step": 334, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.8300341296928329, |
| "f1_execute": 0.923076868057251, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.46875, |
| "learning_rate": 0.00067, |
| "loss": 0.1877, |
| "macro_f1": 0.307692289352417, |
| "num_tokens": 533183.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.33230671286582947, |
| "skip_count": 2.0, |
| "step": 336, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.8409556313993174, |
| "f1_execute": 0.9230769276618958, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.6875, |
| "learning_rate": 0.000674, |
| "loss": 0.1249, |
| "macro_f1": 0.3076923191547394, |
| "num_tokens": 536858.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.15104004740715027, |
| "skip_count": 2.0, |
| "step": 338, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.851877133105802, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 9.625, |
| "learning_rate": 0.0006780000000000001, |
| "loss": 0.1885, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 540769.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.032123174518346786, |
| "skip_count": 0.0, |
| "step": 340, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.8627986348122867, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.96875, |
| "learning_rate": 0.0006820000000000001, |
| "loss": 0.1809, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 543783.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.05651572719216347, |
| "skip_count": 1.0, |
| "step": 342, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 1.8737201365187715, |
| "f1_execute": 0.9230769276618958, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.6875, |
| "learning_rate": 0.0006860000000000001, |
| "loss": 0.1804, |
| "macro_f1": 0.3076923191547394, |
| "num_tokens": 547125.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.13617995381355286, |
| "skip_count": 2.0, |
| "step": 344, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.884641638225256, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.65625, |
| "learning_rate": 0.00069, |
| "loss": 0.204, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 550591.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.023369189351797104, |
| "skip_count": 0.0, |
| "step": 346, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.8955631399317405, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.625, |
| "learning_rate": 0.000694, |
| "loss": 0.2275, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 553785.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.09765879064798355, |
| "skip_count": 1.0, |
| "step": 348, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.9064846416382253, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 10.5, |
| "learning_rate": 0.0006979999999999999, |
| "loss": 0.4191, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 556135.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.011158714070916176, |
| "skip_count": 0.0, |
| "step": 350, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.91740614334471, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.53125, |
| "learning_rate": 0.0007019999999999999, |
| "loss": 0.1557, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 558980.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.036593515425920486, |
| "skip_count": 0.0, |
| "step": 352, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.25, |
| "avg_layers": 26.0, |
| "epoch": 1.9283276450511946, |
| "f1_execute": 0.8979591727256775, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.3333333432674408, |
| "grad_norm": 4.1875, |
| "learning_rate": 0.0007059999999999999, |
| "loss": 0.183, |
| "macro_f1": 0.4104308485984802, |
| "num_tokens": 562187.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.48064568638801575, |
| "skip_count": 4.0, |
| "step": 354, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 1.9392491467576791, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 8.0, |
| "learning_rate": 0.00071, |
| "loss": 0.1982, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 565278.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.13826458156108856, |
| "skip_count": 1.0, |
| "step": 356, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 1.9501706484641637, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.90625, |
| "learning_rate": 0.000714, |
| "loss": 0.2709, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 567869.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.01589345932006836, |
| "skip_count": 0.0, |
| "step": 358, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 1.9610921501706484, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.09375, |
| "learning_rate": 0.000718, |
| "loss": 0.1902, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 571069.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.029062755405902863, |
| "skip_count": 0.0, |
| "step": 360, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 1.9720136518771332, |
| "f1_execute": 0.9230769276618958, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 8.125, |
| "learning_rate": 0.000722, |
| "loss": 0.2125, |
| "macro_f1": 0.3076923191547394, |
| "num_tokens": 573838.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.3241157531738281, |
| "skip_count": 1.0, |
| "step": 362, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 1.9829351535836177, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.46875, |
| "learning_rate": 0.000726, |
| "loss": 0.2176, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 576554.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.03469887003302574, |
| "skip_count": 0.0, |
| "step": 364, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 25.0, |
| "epoch": 1.9938566552901023, |
| "f1_execute": 0.9411764740943909, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 7.34375, |
| "learning_rate": 0.00073, |
| "loss": 0.182, |
| "macro_f1": 0.4803921580314636, |
| "num_tokens": 579653.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.11800751090049744, |
| "skip_count": 1.0, |
| "step": 366, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.0, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 15.125, |
| "learning_rate": 0.000734, |
| "loss": 0.3307, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 581832.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.014465595595538616, |
| "skip_count": 0.0, |
| "step": 368, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 2.0109215017064845, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 8.375, |
| "learning_rate": 0.000738, |
| "loss": 0.1482, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 585207.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.030198052525520325, |
| "skip_count": 0.0, |
| "step": 370, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 2.021843003412969, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.78125, |
| "learning_rate": 0.000742, |
| "loss": 0.0906, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 588893.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.04226446524262428, |
| "skip_count": 1.0, |
| "step": 372, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.6666666865348816, |
| "acc_skip": 1.0, |
| "avg_layers": 27.0, |
| "epoch": 2.032764505119454, |
| "f1_execute": 0.9777777791023254, |
| "f1_repeat": 0.800000011920929, |
| "f1_skip": 1.0, |
| "grad_norm": 8.0625, |
| "learning_rate": 0.000746, |
| "loss": 0.2092, |
| "macro_f1": 0.9259259104728699, |
| "num_tokens": 592246.0, |
| "repeat_count": 3.0, |
| "routers_loss": 0.05995782092213631, |
| "skip_count": 3.0, |
| "step": 374, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 2.0436860068259386, |
| "f1_execute": 0.9019607901573181, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.03125, |
| "learning_rate": 0.00075, |
| "loss": 0.1724, |
| "macro_f1": 0.3006536066532135, |
| "num_tokens": 594777.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.14366891980171204, |
| "skip_count": 3.0, |
| "step": 376, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.054607508532423, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 1.90625, |
| "learning_rate": 0.000754, |
| "loss": 0.0803, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 597931.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0027963866014033556, |
| "skip_count": 0.0, |
| "step": 378, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 29.0, |
| "epoch": 2.0655290102389077, |
| "f1_execute": 0.9411764740943909, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 6.28125, |
| "learning_rate": 0.000758, |
| "loss": 0.2873, |
| "macro_f1": 0.5359477400779724, |
| "num_tokens": 601227.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.15012779831886292, |
| "skip_count": 2.0, |
| "step": 380, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.5, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 2.0764505119453927, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.6666666865348816, |
| "f1_skip": 0.0, |
| "grad_norm": 4.96875, |
| "learning_rate": 0.000762, |
| "loss": 0.1602, |
| "macro_f1": 0.5427350401878357, |
| "num_tokens": 604297.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.0708698183298111, |
| "skip_count": 1.0, |
| "step": 382, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.20000000298023224, |
| "avg_layers": 28.0, |
| "epoch": 2.087372013651877, |
| "f1_execute": 0.8510638475418091, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.3333333134651184, |
| "grad_norm": 8.25, |
| "learning_rate": 0.0007660000000000001, |
| "loss": 0.1786, |
| "macro_f1": 0.3947990834712982, |
| "num_tokens": 607137.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.46035754680633545, |
| "skip_count": 5.0, |
| "step": 384, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 2.0982935153583617, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 9.8125, |
| "learning_rate": 0.0007700000000000001, |
| "loss": 0.1415, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 610067.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.04594701901078224, |
| "skip_count": 2.0, |
| "step": 386, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.25, |
| "avg_layers": 26.0, |
| "epoch": 2.1092150170648463, |
| "f1_execute": 0.9387754797935486, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.3333333432674408, |
| "grad_norm": 5.59375, |
| "learning_rate": 0.0007740000000000001, |
| "loss": 0.1453, |
| "macro_f1": 0.42403626441955566, |
| "num_tokens": 613020.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.21872307360172272, |
| "skip_count": 4.0, |
| "step": 388, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.1201365187713312, |
| "f1_execute": 0.9019607901573181, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.71875, |
| "learning_rate": 0.000778, |
| "loss": 0.2459, |
| "macro_f1": 0.3006536066532135, |
| "num_tokens": 615777.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.17068128287792206, |
| "skip_count": 3.0, |
| "step": 390, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 2.131058020477816, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 3.40625, |
| "learning_rate": 0.000782, |
| "loss": 0.1734, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 618883.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.06883871555328369, |
| "skip_count": 2.0, |
| "step": 392, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 2.1419795221843003, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 4.4375, |
| "learning_rate": 0.000786, |
| "loss": 0.1822, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 621785.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.021629702299833298, |
| "skip_count": 2.0, |
| "step": 394, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 2.152901023890785, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 10.4375, |
| "learning_rate": 0.00079, |
| "loss": 0.2188, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 624497.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.02989846095442772, |
| "skip_count": 2.0, |
| "step": 396, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.1638225255972694, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.03125, |
| "learning_rate": 0.0007940000000000001, |
| "loss": 0.2, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 627530.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0030090075451880693, |
| "skip_count": 0.0, |
| "step": 398, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 2.1747440273037544, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.625, |
| "learning_rate": 0.0007980000000000001, |
| "loss": 0.1503, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 630816.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.02026674523949623, |
| "skip_count": 0.0, |
| "step": 400, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.185665529010239, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.6875, |
| "learning_rate": 0.0008020000000000001, |
| "loss": 0.1285, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 633715.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.08777285367250443, |
| "skip_count": 0.0, |
| "step": 402, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.1965870307167235, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.59375, |
| "learning_rate": 0.0008060000000000001, |
| "loss": 0.186, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 636871.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.049915000796318054, |
| "skip_count": 1.0, |
| "step": 404, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 2.207508532423208, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 5.34375, |
| "learning_rate": 0.0008100000000000001, |
| "loss": 0.1592, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 639784.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.05443386733531952, |
| "skip_count": 2.0, |
| "step": 406, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 2.218430034129693, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.90625, |
| "learning_rate": 0.0008139999999999999, |
| "loss": 0.1947, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 642682.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.021953796967864037, |
| "skip_count": 0.0, |
| "step": 408, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.2293515358361775, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.625, |
| "learning_rate": 0.0008179999999999999, |
| "loss": 0.2197, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 645962.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.010657553561031818, |
| "skip_count": 0.0, |
| "step": 410, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.240273037542662, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.421875, |
| "learning_rate": 0.0008219999999999999, |
| "loss": 0.2091, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 649180.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.013879667967557907, |
| "skip_count": 0.0, |
| "step": 412, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 2.2511945392491466, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.0625, |
| "learning_rate": 0.000826, |
| "loss": 0.1555, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 653015.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.12807206809520721, |
| "skip_count": 2.0, |
| "step": 414, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.6666666865348816, |
| "avg_layers": 25.0, |
| "epoch": 2.2621160409556316, |
| "f1_execute": 0.9166666269302368, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 3.453125, |
| "learning_rate": 0.00083, |
| "loss": 0.1335, |
| "macro_f1": 0.5277777910232544, |
| "num_tokens": 655892.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.8250671625137329, |
| "skip_count": 3.0, |
| "step": 416, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 2.273037542662116, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 5.8125, |
| "learning_rate": 0.000834, |
| "loss": 0.1831, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 658426.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.03139641508460045, |
| "skip_count": 2.0, |
| "step": 418, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.5, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.2839590443686006, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.6666666865348816, |
| "f1_skip": 0.0, |
| "grad_norm": 3.40625, |
| "learning_rate": 0.000838, |
| "loss": 0.1345, |
| "macro_f1": 0.5427350401878357, |
| "num_tokens": 661809.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.0441780611872673, |
| "skip_count": 0.0, |
| "step": 420, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.294880546075085, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.84375, |
| "learning_rate": 0.000842, |
| "loss": 0.1127, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 664874.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.44332680106163025, |
| "skip_count": 1.0, |
| "step": 422, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 2.3058020477815697, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.96875, |
| "learning_rate": 0.000846, |
| "loss": 0.1225, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 668325.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.059455983340740204, |
| "skip_count": 0.0, |
| "step": 424, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 2.3167235494880547, |
| "f1_execute": 0.9411765336990356, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 6.8125, |
| "learning_rate": 0.00085, |
| "loss": 0.1816, |
| "macro_f1": 0.5359477400779724, |
| "num_tokens": 671097.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.3154633641242981, |
| "skip_count": 2.0, |
| "step": 426, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 25.0, |
| "epoch": 2.3276450511945392, |
| "f1_execute": 0.8979592323303223, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.3333333432674408, |
| "grad_norm": 5.65625, |
| "learning_rate": 0.000854, |
| "loss": 0.122, |
| "macro_f1": 0.4104308784008026, |
| "num_tokens": 674042.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.4580267667770386, |
| "skip_count": 3.0, |
| "step": 428, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 2.3385665529010238, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.21875, |
| "learning_rate": 0.000858, |
| "loss": 0.1113, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 677016.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.015222650021314621, |
| "skip_count": 0.0, |
| "step": 430, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 2.3494880546075088, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.78125, |
| "learning_rate": 0.000862, |
| "loss": 0.1379, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 679990.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.24279196560382843, |
| "skip_count": 0.0, |
| "step": 432, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 2.3604095563139933, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.28125, |
| "learning_rate": 0.000866, |
| "loss": 0.1476, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 682786.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.1684337556362152, |
| "skip_count": 0.0, |
| "step": 434, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.371331058020478, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.0, |
| "learning_rate": 0.00087, |
| "loss": 0.1204, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 685882.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.19464725255966187, |
| "skip_count": 0.0, |
| "step": 436, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.3822525597269624, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.71875, |
| "learning_rate": 0.000874, |
| "loss": 0.1124, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 689570.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.05968143790960312, |
| "skip_count": 2.0, |
| "step": 438, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.393174061433447, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.84375, |
| "learning_rate": 0.000878, |
| "loss": 0.1528, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 693559.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.004517437424510717, |
| "skip_count": 0.0, |
| "step": 440, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 2.404095563139932, |
| "f1_execute": 0.9019607901573181, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.125, |
| "learning_rate": 0.000882, |
| "loss": 0.1353, |
| "macro_f1": 0.3006536066532135, |
| "num_tokens": 696374.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.26632770895957947, |
| "skip_count": 2.0, |
| "step": 442, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 2.4150170648464164, |
| "f1_execute": 0.8571428656578064, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.75, |
| "learning_rate": 0.0008860000000000001, |
| "loss": 0.1874, |
| "macro_f1": 0.2857142984867096, |
| "num_tokens": 699954.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.3751397728919983, |
| "skip_count": 3.0, |
| "step": 444, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.425938566552901, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.28125, |
| "learning_rate": 0.0008900000000000001, |
| "loss": 0.2139, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 703477.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.2166936844587326, |
| "skip_count": 2.0, |
| "step": 446, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.4368600682593855, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.0625, |
| "learning_rate": 0.000894, |
| "loss": 0.3078, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 706342.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.004165076185017824, |
| "skip_count": 0.0, |
| "step": 448, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 2.4477815699658705, |
| "f1_execute": 0.923076868057251, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.5625, |
| "learning_rate": 0.000898, |
| "loss": 0.3248, |
| "macro_f1": 0.307692289352417, |
| "num_tokens": 709048.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.11787679046392441, |
| "skip_count": 1.0, |
| "step": 450, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 2.458703071672355, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.875, |
| "learning_rate": 0.000902, |
| "loss": 0.2151, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 712168.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.24694015085697174, |
| "skip_count": 0.0, |
| "step": 452, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 2.4696245733788396, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 9.0625, |
| "learning_rate": 0.000906, |
| "loss": 0.1899, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 715867.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.14055466651916504, |
| "skip_count": 2.0, |
| "step": 454, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.480546075085324, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.84375, |
| "learning_rate": 0.00091, |
| "loss": 0.136, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 718940.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.2996567487716675, |
| "skip_count": 2.0, |
| "step": 456, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 2.491467576791809, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 8.5625, |
| "learning_rate": 0.0009140000000000001, |
| "loss": 0.2439, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 721407.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.032011453062295914, |
| "skip_count": 2.0, |
| "step": 458, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 2.5023890784982936, |
| "f1_execute": 0.943396270275116, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 11.0, |
| "learning_rate": 0.0009180000000000001, |
| "loss": 0.2592, |
| "macro_f1": 0.3144654333591461, |
| "num_tokens": 726056.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.06647517532110214, |
| "skip_count": 0.0, |
| "step": 460, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.513310580204778, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.875, |
| "learning_rate": 0.0009220000000000001, |
| "loss": 0.1904, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 729038.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.08919267356395721, |
| "skip_count": 0.0, |
| "step": 462, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 2.5242320819112627, |
| "f1_execute": 0.9019607901573181, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.46875, |
| "learning_rate": 0.0009260000000000001, |
| "loss": 0.1969, |
| "macro_f1": 0.3006536066532135, |
| "num_tokens": 732172.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.4903416037559509, |
| "skip_count": 2.0, |
| "step": 464, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 26.0, |
| "epoch": 2.5351535836177472, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 1.0, |
| "grad_norm": 5.5, |
| "learning_rate": 0.00093, |
| "loss": 0.1957, |
| "macro_f1": 0.6666666865348816, |
| "num_tokens": 735282.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.025489339604973793, |
| "skip_count": 2.0, |
| "step": 466, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.546075085324232, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.28125, |
| "learning_rate": 0.000934, |
| "loss": 0.2198, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 739208.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.013121264986693859, |
| "skip_count": 0.0, |
| "step": 468, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.5569965870307167, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.15625, |
| "learning_rate": 0.0009379999999999999, |
| "loss": 0.3641, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 741980.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.45740270614624023, |
| "skip_count": 2.0, |
| "step": 470, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 2.5679180887372013, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.90625, |
| "learning_rate": 0.000942, |
| "loss": 0.1668, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 745551.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.1244814470410347, |
| "skip_count": 2.0, |
| "step": 472, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 2.5788395904436863, |
| "f1_execute": 0.8571428656578064, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 12.25, |
| "learning_rate": 0.000946, |
| "loss": 0.2807, |
| "macro_f1": 0.2857142984867096, |
| "num_tokens": 748488.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.3303976058959961, |
| "skip_count": 3.0, |
| "step": 474, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.3333333432674408, |
| "acc_skip": 0.0, |
| "avg_layers": 30.0, |
| "epoch": 2.589761092150171, |
| "f1_execute": 0.9411764740943909, |
| "f1_repeat": 0.4000000059604645, |
| "f1_skip": 0.0, |
| "grad_norm": 3.640625, |
| "learning_rate": 0.00095, |
| "loss": 0.1353, |
| "macro_f1": 0.44705885648727417, |
| "num_tokens": 752865.0, |
| "repeat_count": 3.0, |
| "routers_loss": 0.24396798014640808, |
| "skip_count": 0.0, |
| "step": 476, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 27.0, |
| "epoch": 2.6006825938566553, |
| "f1_execute": 0.9200000166893005, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.4000000059604645, |
| "grad_norm": 5.59375, |
| "learning_rate": 0.000954, |
| "loss": 0.1584, |
| "macro_f1": 0.4400000274181366, |
| "num_tokens": 755653.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.09343712776899338, |
| "skip_count": 3.0, |
| "step": 478, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.61160409556314, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.4375, |
| "learning_rate": 0.000958, |
| "loss": 0.2014, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 758567.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.03879999741911888, |
| "skip_count": 1.0, |
| "step": 480, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.6225255972696244, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.5625, |
| "learning_rate": 0.000962, |
| "loss": 0.2174, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 762013.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.13902239501476288, |
| "skip_count": 2.0, |
| "step": 482, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 2.6334470989761094, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.125, |
| "learning_rate": 0.000966, |
| "loss": 0.2322, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 764820.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0281832292675972, |
| "skip_count": 0.0, |
| "step": 484, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 24.0, |
| "epoch": 2.644368600682594, |
| "f1_execute": 0.8799999952316284, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 11.25, |
| "learning_rate": 0.0009699999999999999, |
| "loss": 0.178, |
| "macro_f1": 0.29333335161209106, |
| "num_tokens": 767962.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.3387240767478943, |
| "skip_count": 2.0, |
| "step": 486, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 2.6552901023890785, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.4375, |
| "learning_rate": 0.000974, |
| "loss": 0.1818, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 771189.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.033774666488170624, |
| "skip_count": 0.0, |
| "step": 488, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.666211604095563, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.875, |
| "learning_rate": 0.000978, |
| "loss": 0.2071, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 774073.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.009604716673493385, |
| "skip_count": 0.0, |
| "step": 490, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.6771331058020476, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.46875, |
| "learning_rate": 0.000982, |
| "loss": 0.1853, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 776722.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0034638401120901108, |
| "skip_count": 0.0, |
| "step": 492, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.6880546075085325, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 8.6875, |
| "learning_rate": 0.0009860000000000001, |
| "loss": 0.2882, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 780051.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.08520562946796417, |
| "skip_count": 0.0, |
| "step": 494, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.698976109215017, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.875, |
| "learning_rate": 0.00099, |
| "loss": 0.1995, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 782813.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.16369783878326416, |
| "skip_count": 1.0, |
| "step": 496, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 2.7098976109215016, |
| "f1_execute": 0.9019607901573181, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.859375, |
| "learning_rate": 0.000994, |
| "loss": 0.1725, |
| "macro_f1": 0.3006536066532135, |
| "num_tokens": 785376.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.17243081331253052, |
| "skip_count": 2.0, |
| "step": 498, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.25, |
| "avg_layers": 26.0, |
| "epoch": 2.7208191126279866, |
| "f1_execute": 0.8749999403953552, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.3333333432674408, |
| "grad_norm": 9.9375, |
| "learning_rate": 0.000998, |
| "loss": 0.1842, |
| "macro_f1": 0.402777761220932, |
| "num_tokens": 788030.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.15272235870361328, |
| "skip_count": 4.0, |
| "step": 500, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.731740614334471, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 8.4375, |
| "learning_rate": 0.0009999999674012276, |
| "loss": 0.1709, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 791099.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.02299564890563488, |
| "skip_count": 0.0, |
| "step": 502, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 2.7426621160409557, |
| "f1_execute": 0.943396270275116, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.96875, |
| "learning_rate": 0.000999999706611075, |
| "loss": 0.1858, |
| "macro_f1": 0.3144654333591461, |
| "num_tokens": 794155.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0592501275241375, |
| "skip_count": 0.0, |
| "step": 504, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 2.75358361774744, |
| "f1_execute": 0.923076868057251, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.71875, |
| "learning_rate": 0.0009999991850309056, |
| "loss": 0.1347, |
| "macro_f1": 0.307692289352417, |
| "num_tokens": 797457.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.07785549014806747, |
| "skip_count": 1.0, |
| "step": 506, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 2.7645051194539247, |
| "f1_execute": 0.9411764740943909, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 9.25, |
| "learning_rate": 0.0009999984026609918, |
| "loss": 0.1448, |
| "macro_f1": 0.4803921580314636, |
| "num_tokens": 800614.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.32612788677215576, |
| "skip_count": 2.0, |
| "step": 508, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 2.7754266211604097, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.84375, |
| "learning_rate": 0.0009999973595017412, |
| "loss": 0.2566, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 804027.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.03253546729683876, |
| "skip_count": 0.0, |
| "step": 510, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 28.0, |
| "epoch": 2.7863481228668943, |
| "f1_execute": 0.9411764740943909, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 4.6875, |
| "learning_rate": 0.0009999960555536983, |
| "loss": 0.1271, |
| "macro_f1": 0.5359477400779724, |
| "num_tokens": 807662.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.16023527085781097, |
| "skip_count": 2.0, |
| "step": 512, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 2.797269624573379, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.375, |
| "learning_rate": 0.0009999944908175428, |
| "loss": 0.1876, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 810905.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.022885220125317574, |
| "skip_count": 0.0, |
| "step": 514, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.8081911262798633, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.078125, |
| "learning_rate": 0.0009999926652940912, |
| "loss": 0.1309, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 814110.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.007647325750440359, |
| "skip_count": 0.0, |
| "step": 516, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 2.819112627986348, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 8.375, |
| "learning_rate": 0.0009999905789842955, |
| "loss": 0.2302, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 816905.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.0514276959002018, |
| "skip_count": 0.0, |
| "step": 518, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 2.830034129692833, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.875, |
| "learning_rate": 0.0009999882318892442, |
| "loss": 0.2078, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 819821.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.3009680211544037, |
| "skip_count": 0.0, |
| "step": 520, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 2.8409556313993174, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.859375, |
| "learning_rate": 0.000999985624010161, |
| "loss": 0.1296, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 822580.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.05273444578051567, |
| "skip_count": 1.0, |
| "step": 522, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.851877133105802, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.5625, |
| "learning_rate": 0.0009999827553484064, |
| "loss": 0.2293, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 825874.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.008311637677252293, |
| "skip_count": 0.0, |
| "step": 524, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 2.862798634812287, |
| "f1_execute": 0.8799999952316284, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.921875, |
| "learning_rate": 0.0009999796259054763, |
| "loss": 0.1759, |
| "macro_f1": 0.29333335161209106, |
| "num_tokens": 829040.0, |
| "repeat_count": 3.0, |
| "routers_loss": 1.207849383354187, |
| "skip_count": 2.0, |
| "step": 526, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.8737201365187715, |
| "f1_execute": 0.9019608497619629, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.65625, |
| "learning_rate": 0.0009999762356830036, |
| "loss": 0.2089, |
| "macro_f1": 0.3006536364555359, |
| "num_tokens": 834261.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.5721967220306396, |
| "skip_count": 3.0, |
| "step": 528, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 2.884641638225256, |
| "f1_execute": 0.8799999952316284, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.6875, |
| "learning_rate": 0.000999972584682756, |
| "loss": 0.2308, |
| "macro_f1": 0.29333335161209106, |
| "num_tokens": 837501.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.09908123314380646, |
| "skip_count": 2.0, |
| "step": 530, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 2.8955631399317405, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.6875, |
| "learning_rate": 0.0009999686729066381, |
| "loss": 0.1818, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 840390.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.04153004288673401, |
| "skip_count": 0.0, |
| "step": 532, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 26.0, |
| "epoch": 2.906484641638225, |
| "f1_execute": 0.9200000166893005, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.4000000059604645, |
| "grad_norm": 5.09375, |
| "learning_rate": 0.0009999645003566902, |
| "loss": 0.1759, |
| "macro_f1": 0.4400000274181366, |
| "num_tokens": 843327.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.37754446268081665, |
| "skip_count": 3.0, |
| "step": 534, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 2.91740614334471, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 3.953125, |
| "learning_rate": 0.0009999600670350882, |
| "loss": 0.1873, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 847028.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.03440186381340027, |
| "skip_count": 2.0, |
| "step": 536, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 2.9283276450511946, |
| "f1_execute": 0.8799999952316284, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 12.875, |
| "learning_rate": 0.000999955372944145, |
| "loss": 0.342, |
| "macro_f1": 0.29333335161209106, |
| "num_tokens": 850735.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.18292225897312164, |
| "skip_count": 0.0, |
| "step": 538, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.939249146757679, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.421875, |
| "learning_rate": 0.0009999504180863087, |
| "loss": 0.1714, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 854731.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.31060779094696045, |
| "skip_count": 1.0, |
| "step": 540, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.9501706484641637, |
| "f1_execute": 0.943396270275116, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.65625, |
| "learning_rate": 0.0009999452024641636, |
| "loss": 0.1744, |
| "macro_f1": 0.3144654333591461, |
| "num_tokens": 858249.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.09356094151735306, |
| "skip_count": 2.0, |
| "step": 542, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.961092150170648, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.4375, |
| "learning_rate": 0.0009999397260804302, |
| "loss": 0.1456, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 860901.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.006649349816143513, |
| "skip_count": 0.0, |
| "step": 544, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 2.972013651877133, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.0, |
| "learning_rate": 0.0009999339889379647, |
| "loss": 0.191, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 863756.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.024081196635961533, |
| "skip_count": 0.0, |
| "step": 546, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 2.9829351535836177, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 3.328125, |
| "learning_rate": 0.0009999279910397597, |
| "loss": 0.1806, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 867242.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.06612888723611832, |
| "skip_count": 2.0, |
| "step": 548, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 2.9938566552901023, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.09375, |
| "learning_rate": 0.000999921732388943, |
| "loss": 0.1438, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 870235.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.02564089559018612, |
| "skip_count": 0.0, |
| "step": 550, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 3.0, |
| "f1_execute": 0.9019607901573181, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 11.0, |
| "learning_rate": 0.0009999152129887801, |
| "loss": 0.1395, |
| "macro_f1": 0.3006536066532135, |
| "num_tokens": 872748.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.31180688738822937, |
| "skip_count": 2.0, |
| "step": 552, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 1.0, |
| "acc_skip": 0.8333333134651184, |
| "avg_layers": 25.0, |
| "epoch": 3.0109215017064845, |
| "f1_execute": 0.9523809552192688, |
| "f1_repeat": 0.6666666865348816, |
| "f1_skip": 0.9090909361839294, |
| "grad_norm": 7.8125, |
| "learning_rate": 0.0009999084328426704, |
| "loss": 0.1243, |
| "macro_f1": 0.8427128791809082, |
| "num_tokens": 876257.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.06441941112279892, |
| "skip_count": 6.0, |
| "step": 554, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.021843003412969, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.4375, |
| "learning_rate": 0.0009999013919541506, |
| "loss": 0.2276, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 879189.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.1297590732574463, |
| "skip_count": 2.0, |
| "step": 556, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.5, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 3.032764505119454, |
| "f1_execute": 0.95652174949646, |
| "f1_repeat": 0.6666666865348816, |
| "f1_skip": 0.5714285373687744, |
| "grad_norm": 2.953125, |
| "learning_rate": 0.0009998940903268932, |
| "loss": 0.1034, |
| "macro_f1": 0.7315390110015869, |
| "num_tokens": 882626.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.40159890055656433, |
| "skip_count": 4.0, |
| "step": 558, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 3.0436860068259386, |
| "f1_execute": 0.923076868057251, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.96875, |
| "learning_rate": 0.0009998865279647066, |
| "loss": 0.1627, |
| "macro_f1": 0.307692289352417, |
| "num_tokens": 885572.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.05809749290347099, |
| "skip_count": 3.0, |
| "step": 560, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 3.054607508532423, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.8125, |
| "learning_rate": 0.0009998787048715349, |
| "loss": 0.1533, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 889088.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.4470720589160919, |
| "skip_count": 2.0, |
| "step": 562, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 3.0655290102389077, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.078125, |
| "learning_rate": 0.0009998706210514589, |
| "loss": 0.167, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 892449.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.017404144629836082, |
| "skip_count": 0.0, |
| "step": 564, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 24.0, |
| "epoch": 3.0764505119453927, |
| "f1_execute": 0.8749999403953552, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 9.5, |
| "learning_rate": 0.0009998622765086946, |
| "loss": 0.1492, |
| "macro_f1": 0.2916666567325592, |
| "num_tokens": 895586.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.3639675974845886, |
| "skip_count": 1.0, |
| "step": 566, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 23.0, |
| "epoch": 3.087372013651877, |
| "f1_execute": 0.8979591727256775, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.3333333134651184, |
| "grad_norm": 9.0625, |
| "learning_rate": 0.0009998536712475944, |
| "loss": 0.2095, |
| "macro_f1": 0.4104308485984802, |
| "num_tokens": 898285.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.16401837766170502, |
| "skip_count": 1.0, |
| "step": 568, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 26.0, |
| "epoch": 3.0982935153583617, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 3.921875, |
| "learning_rate": 0.0009998448052726467, |
| "loss": 0.1679, |
| "macro_f1": 0.5427350401878357, |
| "num_tokens": 901345.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.2740897238254547, |
| "skip_count": 1.0, |
| "step": 570, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.1092150170648463, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.921875, |
| "learning_rate": 0.000999835678588476, |
| "loss": 0.1513, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 904674.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.004289933945983648, |
| "skip_count": 0.0, |
| "step": 572, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 28.0, |
| "epoch": 3.1201365187713312, |
| "f1_execute": 0.9200000166893005, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 5.125, |
| "learning_rate": 0.0009998262911998423, |
| "loss": 0.2076, |
| "macro_f1": 0.47333335876464844, |
| "num_tokens": 908392.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.6915572881698608, |
| "skip_count": 3.0, |
| "step": 574, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 25.0, |
| "epoch": 3.131058020477816, |
| "f1_execute": 0.9387754797935486, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 5.65625, |
| "learning_rate": 0.000999816643111642, |
| "loss": 0.166, |
| "macro_f1": 0.47959184646606445, |
| "num_tokens": 911574.0, |
| "repeat_count": 3.0, |
| "routers_loss": 0.27853959798812866, |
| "skip_count": 1.0, |
| "step": 576, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 3.1419795221843003, |
| "f1_execute": 0.9230769276618958, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.6875, |
| "learning_rate": 0.0009998067343289074, |
| "loss": 0.2197, |
| "macro_f1": 0.3076923191547394, |
| "num_tokens": 914726.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.39462774991989136, |
| "skip_count": 1.0, |
| "step": 578, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.152901023890785, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.515625, |
| "learning_rate": 0.0009997965648568066, |
| "loss": 0.1345, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 918249.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0032140507828444242, |
| "skip_count": 0.0, |
| "step": 580, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.1638225255972694, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.1875, |
| "learning_rate": 0.000999786134700644, |
| "loss": 0.1132, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 921025.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0016512145521119237, |
| "skip_count": 0.0, |
| "step": 582, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 3.1747440273037544, |
| "f1_execute": 0.9019607901573181, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.59375, |
| "learning_rate": 0.0009997754438658595, |
| "loss": 0.0915, |
| "macro_f1": 0.3006536066532135, |
| "num_tokens": 924102.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.6956021785736084, |
| "skip_count": 2.0, |
| "step": 584, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 28.0, |
| "epoch": 3.185665529010239, |
| "f1_execute": 0.9411764740943909, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 9.1875, |
| "learning_rate": 0.0009997644923580293, |
| "loss": 0.1437, |
| "macro_f1": 0.5359477400779724, |
| "num_tokens": 927662.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.32544562220573425, |
| "skip_count": 2.0, |
| "step": 586, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.1965870307167235, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.25, |
| "learning_rate": 0.0009997532801828658, |
| "loss": 0.1488, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 930556.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.00869440846145153, |
| "skip_count": 0.0, |
| "step": 588, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.207508532423208, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.65625, |
| "learning_rate": 0.0009997418073462167, |
| "loss": 0.1584, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 933435.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.08498232066631317, |
| "skip_count": 2.0, |
| "step": 590, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.218430034129693, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.015625, |
| "learning_rate": 0.0009997300738540662, |
| "loss": 0.1075, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 936478.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.19423364102840424, |
| "skip_count": 2.0, |
| "step": 592, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 26.0, |
| "epoch": 3.2293515358361775, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 6.03125, |
| "learning_rate": 0.000999718079712534, |
| "loss": 0.1615, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 939400.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.02402239292860031, |
| "skip_count": 1.0, |
| "step": 594, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 1.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 3.240273037542662, |
| "f1_execute": 1.0, |
| "f1_repeat": 1.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.875, |
| "learning_rate": 0.0009997058249278763, |
| "loss": 0.221, |
| "macro_f1": 0.6666666865348816, |
| "num_tokens": 943300.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.0028402789030224085, |
| "skip_count": 0.0, |
| "step": 596, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.2511945392491466, |
| "f1_execute": 0.943396270275116, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.09375, |
| "learning_rate": 0.0009996933095064847, |
| "loss": 0.1423, |
| "macro_f1": 0.3144654333591461, |
| "num_tokens": 947399.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.2962486445903778, |
| "skip_count": 2.0, |
| "step": 598, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.2621160409556316, |
| "f1_execute": 0.8799999952316284, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.625, |
| "learning_rate": 0.0009996805334548872, |
| "loss": 0.1535, |
| "macro_f1": 0.29333335161209106, |
| "num_tokens": 950094.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.47425299882888794, |
| "skip_count": 4.0, |
| "step": 600, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.4000000059604645, |
| "avg_layers": 24.0, |
| "epoch": 3.273037542662116, |
| "f1_execute": 0.8636363744735718, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.444444477558136, |
| "grad_norm": 4.71875, |
| "learning_rate": 0.0009996674967797476, |
| "loss": 0.1282, |
| "macro_f1": 0.43602699041366577, |
| "num_tokens": 953673.0, |
| "repeat_count": 3.0, |
| "routers_loss": 0.3788261115550995, |
| "skip_count": 5.0, |
| "step": 602, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.2839590443686006, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.5625, |
| "learning_rate": 0.0009996541994878655, |
| "loss": 0.1239, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 956885.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.13212358951568604, |
| "skip_count": 0.0, |
| "step": 604, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 27.0, |
| "epoch": 3.294880546075085, |
| "f1_execute": 0.9803921580314636, |
| "f1_repeat": 0.0, |
| "f1_skip": 1.0, |
| "grad_norm": 3.828125, |
| "learning_rate": 0.0009996406415861763, |
| "loss": 0.0874, |
| "macro_f1": 0.6601307392120361, |
| "num_tokens": 959794.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0332571342587471, |
| "skip_count": 2.0, |
| "step": 606, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.3058020477815697, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.5625, |
| "learning_rate": 0.0009996268230817518, |
| "loss": 0.1068, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 963516.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.007200752384960651, |
| "skip_count": 0.0, |
| "step": 608, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.3167235494880547, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.75, |
| "learning_rate": 0.0009996127439817993, |
| "loss": 0.1237, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 966363.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.23764896392822266, |
| "skip_count": 1.0, |
| "step": 610, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.3276450511945392, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.484375, |
| "learning_rate": 0.0009995984042936621, |
| "loss": 0.1411, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 969265.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0006030416116118431, |
| "skip_count": 0.0, |
| "step": 612, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 26.0, |
| "epoch": 3.3385665529010238, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 4.8125, |
| "learning_rate": 0.0009995838040248197, |
| "loss": 0.1516, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 972024.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.029178157448768616, |
| "skip_count": 1.0, |
| "step": 614, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 24.0, |
| "epoch": 3.3494880546075088, |
| "f1_execute": 0.9200000166893005, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.3333333432674408, |
| "grad_norm": 6.0625, |
| "learning_rate": 0.0009995689431828872, |
| "loss": 0.132, |
| "macro_f1": 0.41777777671813965, |
| "num_tokens": 974328.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.41580793261528015, |
| "skip_count": 2.0, |
| "step": 616, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 3.3604095563139933, |
| "f1_execute": 0.923076868057251, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 9.625, |
| "learning_rate": 0.000999553821775616, |
| "loss": 0.1495, |
| "macro_f1": 0.307692289352417, |
| "num_tokens": 977628.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.26905494928359985, |
| "skip_count": 3.0, |
| "step": 618, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.371331058020478, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.375, |
| "learning_rate": 0.0009995384398108927, |
| "loss": 0.1372, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 980458.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.007225328590720892, |
| "skip_count": 0.0, |
| "step": 620, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 27.0, |
| "epoch": 3.3822525597269624, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 1.0, |
| "grad_norm": 2.984375, |
| "learning_rate": 0.0009995227972967404, |
| "loss": 0.1104, |
| "macro_f1": 0.6603773832321167, |
| "num_tokens": 983776.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.09698990732431412, |
| "skip_count": 1.0, |
| "step": 622, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.393174061433447, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.40625, |
| "learning_rate": 0.000999506894241318, |
| "loss": 0.1211, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 986625.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.028710627928376198, |
| "skip_count": 0.0, |
| "step": 624, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 3.404095563139932, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 5.53125, |
| "learning_rate": 0.0009994907306529201, |
| "loss": 0.186, |
| "macro_f1": 0.5427350401878357, |
| "num_tokens": 989896.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.18436689674854279, |
| "skip_count": 2.0, |
| "step": 626, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 26.0, |
| "epoch": 3.4150170648464164, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 1.0, |
| "grad_norm": 5.71875, |
| "learning_rate": 0.0009994743065399776, |
| "loss": 0.1819, |
| "macro_f1": 0.6666666865348816, |
| "num_tokens": 992963.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.011628196574747562, |
| "skip_count": 2.0, |
| "step": 628, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 3.425938566552901, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 8.5625, |
| "learning_rate": 0.0009994576219110565, |
| "loss": 0.2279, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 995486.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.03694930672645569, |
| "skip_count": 0.0, |
| "step": 630, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 3.4368600682593855, |
| "f1_execute": 0.9230769276618958, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.71875, |
| "learning_rate": 0.0009994406767748596, |
| "loss": 0.2908, |
| "macro_f1": 0.3076923191547394, |
| "num_tokens": 998880.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.3335764706134796, |
| "skip_count": 1.0, |
| "step": 632, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 3.4477815699658705, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 5.3125, |
| "learning_rate": 0.000999423471140225, |
| "loss": 0.1652, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 1001623.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.03843867778778076, |
| "skip_count": 2.0, |
| "step": 634, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 3.458703071672355, |
| "f1_execute": 0.923076868057251, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.15625, |
| "learning_rate": 0.0009994060050161268, |
| "loss": 0.1534, |
| "macro_f1": 0.307692289352417, |
| "num_tokens": 1004900.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.26561209559440613, |
| "skip_count": 1.0, |
| "step": 636, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 28.0, |
| "epoch": 3.4696245733788396, |
| "f1_execute": 0.9411764740943909, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 6.40625, |
| "learning_rate": 0.0009993882784116752, |
| "loss": 0.147, |
| "macro_f1": 0.4803921580314636, |
| "num_tokens": 1008732.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.3012487590312958, |
| "skip_count": 3.0, |
| "step": 638, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.480546075085324, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.15625, |
| "learning_rate": 0.0009993702913361155, |
| "loss": 0.1252, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1011699.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.012646762654185295, |
| "skip_count": 0.0, |
| "step": 640, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 27.0, |
| "epoch": 3.491467576791809, |
| "f1_execute": 0.9411765336990356, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 3.90625, |
| "learning_rate": 0.0009993520437988302, |
| "loss": 0.1487, |
| "macro_f1": 0.480392187833786, |
| "num_tokens": 1014406.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.1068505123257637, |
| "skip_count": 3.0, |
| "step": 642, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 3.5023890784982936, |
| "f1_execute": 0.8085106015205383, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.34375, |
| "learning_rate": 0.000999333535809336, |
| "loss": 0.1731, |
| "macro_f1": 0.26950353384017944, |
| "num_tokens": 1017801.0, |
| "repeat_count": 2.0, |
| "routers_loss": 2.2939841747283936, |
| "skip_count": 5.0, |
| "step": 644, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 3.513310580204778, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 8.8125, |
| "learning_rate": 0.0009993147673772868, |
| "loss": 0.1609, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1021185.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.02110578864812851, |
| "skip_count": 0.0, |
| "step": 646, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 3.5242320819112627, |
| "f1_execute": 0.9600000381469727, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.4000000059604645, |
| "grad_norm": 4.90625, |
| "learning_rate": 0.000999295738512472, |
| "loss": 0.124, |
| "macro_f1": 0.4533333480358124, |
| "num_tokens": 1025108.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.15021832287311554, |
| "skip_count": 2.0, |
| "step": 648, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.5351535836177472, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.0, |
| "learning_rate": 0.0009992764492248163, |
| "loss": 0.2309, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1028805.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.002900304039940238, |
| "skip_count": 0.0, |
| "step": 650, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 26.0, |
| "epoch": 3.546075085324232, |
| "f1_execute": 0.9411764740943909, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.4000000059604645, |
| "grad_norm": 5.0, |
| "learning_rate": 0.0009992568995243808, |
| "loss": 0.1452, |
| "macro_f1": 0.44705885648727417, |
| "num_tokens": 1032069.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.2886044383049011, |
| "skip_count": 3.0, |
| "step": 652, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.5569965870307167, |
| "f1_execute": 0.943396270275116, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.0625, |
| "learning_rate": 0.0009992370894213623, |
| "loss": 0.1319, |
| "macro_f1": 0.3144654333591461, |
| "num_tokens": 1035634.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.42971259355545044, |
| "skip_count": 2.0, |
| "step": 654, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 1.0, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 27.0, |
| "epoch": 3.5679180887372013, |
| "f1_execute": 0.9387754797935486, |
| "f1_repeat": 1.0, |
| "f1_skip": 0.4000000059604645, |
| "grad_norm": 7.375, |
| "learning_rate": 0.000999217018926093, |
| "loss": 0.1152, |
| "macro_f1": 0.7795917987823486, |
| "num_tokens": 1039948.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.07567094266414642, |
| "skip_count": 3.0, |
| "step": 656, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.5788395904436863, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.46875, |
| "learning_rate": 0.0009991966880490417, |
| "loss": 0.1425, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1043710.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.001569207408465445, |
| "skip_count": 0.0, |
| "step": 658, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.589761092150171, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.453125, |
| "learning_rate": 0.0009991760968008124, |
| "loss": 0.1177, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1047211.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.014489148743450642, |
| "skip_count": 0.0, |
| "step": 660, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 3.6006825938566553, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.34375, |
| "learning_rate": 0.0009991552451921453, |
| "loss": 0.104, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 1050220.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.052834026515483856, |
| "skip_count": 1.0, |
| "step": 662, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 1.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 3.61160409556314, |
| "f1_execute": 0.875, |
| "f1_repeat": 1.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.84375, |
| "learning_rate": 0.0009991341332339157, |
| "loss": 0.1706, |
| "macro_f1": 0.625, |
| "num_tokens": 1053982.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.2865705192089081, |
| "skip_count": 3.0, |
| "step": 664, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 3.6225255972696244, |
| "f1_execute": 0.923076868057251, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.25, |
| "learning_rate": 0.0009991127609371357, |
| "loss": 0.1275, |
| "macro_f1": 0.307692289352417, |
| "num_tokens": 1056846.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.32878634333610535, |
| "skip_count": 0.0, |
| "step": 666, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.5, |
| "acc_skip": 1.0, |
| "avg_layers": 25.0, |
| "epoch": 3.6334470989761094, |
| "f1_execute": 0.9777777791023254, |
| "f1_repeat": 0.6666666865348816, |
| "f1_skip": 1.0, |
| "grad_norm": 3.328125, |
| "learning_rate": 0.0009990911283129524, |
| "loss": 0.1348, |
| "macro_f1": 0.8814815282821655, |
| "num_tokens": 1059648.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.10558832436800003, |
| "skip_count": 4.0, |
| "step": 668, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 26.0, |
| "epoch": 3.644368600682594, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 1.0, |
| "grad_norm": 1.90625, |
| "learning_rate": 0.0009990692353726489, |
| "loss": 0.0572, |
| "macro_f1": 0.6666666865348816, |
| "num_tokens": 1062290.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0071791489608585835, |
| "skip_count": 2.0, |
| "step": 670, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 3.6552901023890785, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.0625, |
| "learning_rate": 0.0009990470821276442, |
| "loss": 0.156, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1065212.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.028384100645780563, |
| "skip_count": 0.0, |
| "step": 672, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 3.666211604095563, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 7.4375, |
| "learning_rate": 0.0009990246685894933, |
| "loss": 0.1457, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 1068029.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.03461477532982826, |
| "skip_count": 2.0, |
| "step": 674, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.6771331058020476, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.78125, |
| "learning_rate": 0.0009990019947698863, |
| "loss": 0.1055, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1071229.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.004003713373094797, |
| "skip_count": 0.0, |
| "step": 676, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.6666666865348816, |
| "avg_layers": 26.0, |
| "epoch": 3.6880546075085325, |
| "f1_execute": 0.9803921580314636, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.800000011920929, |
| "grad_norm": 2.015625, |
| "learning_rate": 0.0009989790606806494, |
| "loss": 0.1026, |
| "macro_f1": 0.5934640765190125, |
| "num_tokens": 1074046.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.03134514391422272, |
| "skip_count": 3.0, |
| "step": 678, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 27.0, |
| "epoch": 3.698976109215017, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 1.0, |
| "grad_norm": 2.71875, |
| "learning_rate": 0.0009989558663337447, |
| "loss": 0.1402, |
| "macro_f1": 0.6666666865348816, |
| "num_tokens": 1076635.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.00439166184514761, |
| "skip_count": 1.0, |
| "step": 680, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 3.7098976109215016, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.09375, |
| "learning_rate": 0.0009989324117412699, |
| "loss": 0.1021, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 1079958.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.12589046359062195, |
| "skip_count": 2.0, |
| "step": 682, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.7208191126279866, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.890625, |
| "learning_rate": 0.0009989086969154587, |
| "loss": 0.1762, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1082589.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.01050520222634077, |
| "skip_count": 0.0, |
| "step": 684, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.731740614334471, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.90625, |
| "learning_rate": 0.0009988847218686796, |
| "loss": 0.1527, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1085634.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.08884720504283905, |
| "skip_count": 1.0, |
| "step": 686, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 3.7426621160409557, |
| "f1_execute": 0.8799999952316284, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 9.5625, |
| "learning_rate": 0.0009988604866134384, |
| "loss": 0.196, |
| "macro_f1": 0.29333335161209106, |
| "num_tokens": 1088501.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.3627224862575531, |
| "skip_count": 2.0, |
| "step": 688, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 3.75358361774744, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.9375, |
| "learning_rate": 0.0009988359911623748, |
| "loss": 0.2456, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1091083.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.025369791314005852, |
| "skip_count": 0.0, |
| "step": 690, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 3.7645051194539247, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.46875, |
| "learning_rate": 0.000998811235528266, |
| "loss": 0.1186, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1095673.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.023373540490865707, |
| "skip_count": 0.0, |
| "step": 692, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 3.7754266211604097, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.03125, |
| "learning_rate": 0.0009987862197240237, |
| "loss": 0.1518, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1098519.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.014006087556481361, |
| "skip_count": 0.0, |
| "step": 694, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 3.7863481228668943, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.59375, |
| "learning_rate": 0.0009987609437626954, |
| "loss": 0.2149, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 1101510.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.057559430599212646, |
| "skip_count": 1.0, |
| "step": 696, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.797269624573379, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.765625, |
| "learning_rate": 0.0009987354076574648, |
| "loss": 0.1507, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1104637.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.001837484072893858, |
| "skip_count": 0.0, |
| "step": 698, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.8081911262798633, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.53125, |
| "learning_rate": 0.0009987096114216511, |
| "loss": 0.1046, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1107964.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.3758608400821686, |
| "skip_count": 1.0, |
| "step": 700, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 25.0, |
| "epoch": 3.819112627986348, |
| "f1_execute": 0.9803921580314636, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.800000011920929, |
| "grad_norm": 4.375, |
| "learning_rate": 0.000998683555068709, |
| "loss": 0.1269, |
| "macro_f1": 0.5934640765190125, |
| "num_tokens": 1111541.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.02019377611577511, |
| "skip_count": 2.0, |
| "step": 702, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 3.830034129692833, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.671875, |
| "learning_rate": 0.000998657238612229, |
| "loss": 0.1522, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1114819.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.019685756415128708, |
| "skip_count": 0.0, |
| "step": 704, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.8409556313993174, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.484375, |
| "learning_rate": 0.0009986306620659374, |
| "loss": 0.1104, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1117888.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0059326752088963985, |
| "skip_count": 0.0, |
| "step": 706, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 3.851877133105802, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.125, |
| "learning_rate": 0.0009986038254436956, |
| "loss": 0.1038, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 1120946.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.022552471607923508, |
| "skip_count": 0.0, |
| "step": 708, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 3.862798634812287, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 3.625, |
| "learning_rate": 0.0009985767287595015, |
| "loss": 0.1433, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 1124013.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.03914980590343475, |
| "skip_count": 2.0, |
| "step": 710, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 1.0, |
| "acc_skip": 1.0, |
| "avg_layers": 27.0, |
| "epoch": 3.8737201365187715, |
| "f1_execute": 1.0, |
| "f1_repeat": 1.0, |
| "f1_skip": 1.0, |
| "grad_norm": 5.0625, |
| "learning_rate": 0.0009985493720274879, |
| "loss": 0.1663, |
| "macro_f1": 1.0, |
| "num_tokens": 1127662.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.01359120849519968, |
| "skip_count": 2.0, |
| "step": 712, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 3.884641638225256, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.96875, |
| "learning_rate": 0.0009985217552619236, |
| "loss": 0.1134, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1130742.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0699341893196106, |
| "skip_count": 0.0, |
| "step": 714, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.8955631399317405, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.75, |
| "learning_rate": 0.000998493878477213, |
| "loss": 0.1643, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1133386.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.006396451499313116, |
| "skip_count": 0.0, |
| "step": 716, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.3333333432674408, |
| "acc_skip": 0.5, |
| "avg_layers": 28.0, |
| "epoch": 3.906484641638225, |
| "f1_execute": 0.8292683362960815, |
| "f1_repeat": 0.3333333432674408, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 4.75, |
| "learning_rate": 0.0009984657416878962, |
| "loss": 0.1396, |
| "macro_f1": 0.6097561120986938, |
| "num_tokens": 1136071.0, |
| "repeat_count": 3.0, |
| "routers_loss": 0.23587316274642944, |
| "skip_count": 6.0, |
| "step": 718, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.91740614334471, |
| "f1_execute": 0.9230769276618958, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 9.625, |
| "learning_rate": 0.0009984373449086485, |
| "loss": 0.1686, |
| "macro_f1": 0.3076923191547394, |
| "num_tokens": 1139061.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.23841485381126404, |
| "skip_count": 2.0, |
| "step": 720, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.5, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 3.9283276450511946, |
| "f1_execute": 0.9200000166893005, |
| "f1_repeat": 0.6666666865348816, |
| "f1_skip": 0.0, |
| "grad_norm": 2.8125, |
| "learning_rate": 0.0009984086881542815, |
| "loss": 0.1112, |
| "macro_f1": 0.5288889408111572, |
| "num_tokens": 1141926.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.37492331862449646, |
| "skip_count": 3.0, |
| "step": 722, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.5, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 3.939249146757679, |
| "f1_execute": 0.9166666865348816, |
| "f1_repeat": 0.6666666865348816, |
| "f1_skip": 0.4000000059604645, |
| "grad_norm": 4.375, |
| "learning_rate": 0.0009983797714397415, |
| "loss": 0.1395, |
| "macro_f1": 0.6611111164093018, |
| "num_tokens": 1145302.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.5061943531036377, |
| "skip_count": 2.0, |
| "step": 724, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 3.9501706484641637, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 11.5625, |
| "learning_rate": 0.0009983505947801115, |
| "loss": 0.327, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1148991.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.030050436034798622, |
| "skip_count": 0.0, |
| "step": 726, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 3.961092150170648, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 4.84375, |
| "learning_rate": 0.0009983211581906088, |
| "loss": 0.2311, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 1151711.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.04163246229290962, |
| "skip_count": 2.0, |
| "step": 728, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 3.972013651877133, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.0625, |
| "learning_rate": 0.0009982914616865875, |
| "loss": 0.1956, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1155061.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.002654903568327427, |
| "skip_count": 0.0, |
| "step": 730, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 3.9829351535836177, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.46875, |
| "learning_rate": 0.0009982615052835364, |
| "loss": 0.1239, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 1158043.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.18476539850234985, |
| "skip_count": 2.0, |
| "step": 732, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 3.9938566552901023, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.625, |
| "learning_rate": 0.0009982312889970804, |
| "loss": 0.211, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 1161487.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.33558642864227295, |
| "skip_count": 0.0, |
| "step": 734, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 4.0, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 8.5625, |
| "learning_rate": 0.0009982008128429794, |
| "loss": 0.14, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1163664.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.010565636679530144, |
| "skip_count": 0.0, |
| "step": 736, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.010921501706485, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.75, |
| "learning_rate": 0.0009981700768371296, |
| "loss": 0.0823, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1166461.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.001561413868330419, |
| "skip_count": 0.0, |
| "step": 738, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 26.0, |
| "epoch": 4.021843003412969, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 1.0, |
| "grad_norm": 5.125, |
| "learning_rate": 0.000998139080995562, |
| "loss": 0.1766, |
| "macro_f1": 0.6666666865348816, |
| "num_tokens": 1170134.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.010665918700397015, |
| "skip_count": 2.0, |
| "step": 740, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 4.032764505119454, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.578125, |
| "learning_rate": 0.0009981078253344432, |
| "loss": 0.1177, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1173075.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.047345057129859924, |
| "skip_count": 1.0, |
| "step": 742, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 27.0, |
| "epoch": 4.043686006825938, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 1.0, |
| "grad_norm": 2.1875, |
| "learning_rate": 0.000998076309870076, |
| "loss": 0.0517, |
| "macro_f1": 0.6666666865348816, |
| "num_tokens": 1176281.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0033105311449617147, |
| "skip_count": 1.0, |
| "step": 744, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 4.054607508532423, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.0625, |
| "learning_rate": 0.000998044534618898, |
| "loss": 0.0864, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 1179403.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.033084314316511154, |
| "skip_count": 0.0, |
| "step": 746, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 4.065529010238908, |
| "f1_execute": 0.9019607901573181, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.234375, |
| "learning_rate": 0.0009980124995974827, |
| "loss": 0.0925, |
| "macro_f1": 0.3006536066532135, |
| "num_tokens": 1182596.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.21827591955661774, |
| "skip_count": 3.0, |
| "step": 748, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 27.0, |
| "epoch": 4.076450511945392, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 3.125, |
| "learning_rate": 0.0009979802048225388, |
| "loss": 0.1244, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 1186303.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.18225915729999542, |
| "skip_count": 3.0, |
| "step": 750, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 1.0, |
| "acc_skip": 0.0, |
| "avg_layers": 30.0, |
| "epoch": 4.087372013651877, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.6666666865348816, |
| "f1_skip": 0.0, |
| "grad_norm": 3.984375, |
| "learning_rate": 0.0009979476503109107, |
| "loss": 0.0728, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 1189299.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.03163563460111618, |
| "skip_count": 0.0, |
| "step": 752, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 24.0, |
| "epoch": 4.098293515358362, |
| "f1_execute": 0.9200000166893005, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.3333333432674408, |
| "grad_norm": 6.34375, |
| "learning_rate": 0.000997914836079578, |
| "loss": 0.148, |
| "macro_f1": 0.41777777671813965, |
| "num_tokens": 1192694.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.28674715757369995, |
| "skip_count": 2.0, |
| "step": 754, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.109215017064846, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.34375, |
| "learning_rate": 0.0009978817621456562, |
| "loss": 0.0869, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 1196319.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.05852695554494858, |
| "skip_count": 1.0, |
| "step": 756, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 4.120136518771331, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 1.6484375, |
| "learning_rate": 0.000997848428526396, |
| "loss": 0.0648, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 1199844.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.06834150850772858, |
| "skip_count": 2.0, |
| "step": 758, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.131058020477815, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.375, |
| "learning_rate": 0.0009978148352391835, |
| "loss": 0.0801, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1202876.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0058227707631886005, |
| "skip_count": 0.0, |
| "step": 760, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 27.0, |
| "epoch": 4.1419795221843, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 3.296875, |
| "learning_rate": 0.00099778098230154, |
| "loss": 0.1094, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 1206870.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.079805389046669, |
| "skip_count": 3.0, |
| "step": 762, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 4.152901023890785, |
| "f1_execute": 0.9230769276618958, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.296875, |
| "learning_rate": 0.0009977468697311232, |
| "loss": 0.0902, |
| "macro_f1": 0.3076923191547394, |
| "num_tokens": 1209825.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.21695999801158905, |
| "skip_count": 2.0, |
| "step": 764, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.5, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.163822525597269, |
| "f1_execute": 0.8749999403953552, |
| "f1_repeat": 0.6666666865348816, |
| "f1_skip": 0.0, |
| "grad_norm": 3.265625, |
| "learning_rate": 0.0009977124975457249, |
| "loss": 0.1244, |
| "macro_f1": 0.5138888955116272, |
| "num_tokens": 1213093.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.12744387984275818, |
| "skip_count": 4.0, |
| "step": 766, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 28.0, |
| "epoch": 4.174744027303754, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 2.34375, |
| "learning_rate": 0.0009976778657632733, |
| "loss": 0.0783, |
| "macro_f1": 0.5427350401878357, |
| "num_tokens": 1216291.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.07573267817497253, |
| "skip_count": 2.0, |
| "step": 768, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.1856655290102385, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.296875, |
| "learning_rate": 0.0009976429744018313, |
| "loss": 0.0752, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1219537.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0009250715957023203, |
| "skip_count": 0.0, |
| "step": 770, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.1965870307167235, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.234375, |
| "learning_rate": 0.0009976078234795983, |
| "loss": 0.1114, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1222736.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.00175693747587502, |
| "skip_count": 0.0, |
| "step": 772, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 4.207508532423208, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 3.390625, |
| "learning_rate": 0.0009975724130149076, |
| "loss": 0.0918, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 1226120.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.027441009879112244, |
| "skip_count": 2.0, |
| "step": 774, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 4.2184300341296925, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.46875, |
| "learning_rate": 0.0009975367430262287, |
| "loss": 0.0992, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1228810.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.027025407180190086, |
| "skip_count": 0.0, |
| "step": 776, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.2293515358361775, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.453125, |
| "learning_rate": 0.0009975008135321667, |
| "loss": 0.0931, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1231669.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.00917113944888115, |
| "skip_count": 0.0, |
| "step": 778, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.2402730375426625, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.890625, |
| "learning_rate": 0.0009974646245514615, |
| "loss": 0.0505, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1234476.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.010482276789844036, |
| "skip_count": 0.0, |
| "step": 780, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 27.0, |
| "epoch": 4.251194539249147, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 1.0, |
| "grad_norm": 4.28125, |
| "learning_rate": 0.0009974281761029886, |
| "loss": 0.0675, |
| "macro_f1": 0.6666666865348816, |
| "num_tokens": 1237748.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.009005382657051086, |
| "skip_count": 1.0, |
| "step": 782, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 4.262116040955632, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 2.9375, |
| "learning_rate": 0.0009973914682057587, |
| "loss": 0.1734, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 1240362.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.09049399197101593, |
| "skip_count": 2.0, |
| "step": 784, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.273037542662116, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.984375, |
| "learning_rate": 0.0009973545008789182, |
| "loss": 0.1156, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1244147.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0037465172354131937, |
| "skip_count": 0.0, |
| "step": 786, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.283959044368601, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.84375, |
| "learning_rate": 0.000997317274141748, |
| "loss": 0.1302, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1247058.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.002100529847666621, |
| "skip_count": 0.0, |
| "step": 788, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 24.0, |
| "epoch": 4.294880546075086, |
| "f1_execute": 0.9200000166893005, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.3333333432674408, |
| "grad_norm": 3.03125, |
| "learning_rate": 0.0009972797880136654, |
| "loss": 0.0771, |
| "macro_f1": 0.41777777671813965, |
| "num_tokens": 1250331.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.08377297967672348, |
| "skip_count": 2.0, |
| "step": 790, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 4.30580204778157, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 3.421875, |
| "learning_rate": 0.0009972420425142224, |
| "loss": 0.0782, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 1253848.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.06583717465400696, |
| "skip_count": 2.0, |
| "step": 792, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 4.316723549488055, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.28125, |
| "learning_rate": 0.0009972040376631057, |
| "loss": 0.1235, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 1257122.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.12353084981441498, |
| "skip_count": 1.0, |
| "step": 794, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.327645051194539, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.171875, |
| "learning_rate": 0.0009971657734801384, |
| "loss": 0.0899, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1261136.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.004150724504143, |
| "skip_count": 0.0, |
| "step": 796, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 4.338566552901024, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.5625, |
| "learning_rate": 0.0009971272499852784, |
| "loss": 0.1815, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1264211.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.02800264209508896, |
| "skip_count": 0.0, |
| "step": 798, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 26.0, |
| "epoch": 4.349488054607509, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 3.125, |
| "learning_rate": 0.0009970884671986187, |
| "loss": 0.1118, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 1266964.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.05382822826504707, |
| "skip_count": 1.0, |
| "step": 800, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 4.360409556313993, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.1875, |
| "learning_rate": 0.0009970494251403874, |
| "loss": 0.1015, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 1269856.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.20994320511817932, |
| "skip_count": 2.0, |
| "step": 802, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 4.371331058020478, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.296875, |
| "learning_rate": 0.000997010123830948, |
| "loss": 0.1095, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 1272945.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.07841377705335617, |
| "skip_count": 1.0, |
| "step": 804, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 1.0, |
| "acc_skip": 0.0, |
| "avg_layers": 30.0, |
| "epoch": 4.382252559726963, |
| "f1_execute": 1.0, |
| "f1_repeat": 1.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.5625, |
| "learning_rate": 0.0009969705632907999, |
| "loss": 0.1242, |
| "macro_f1": 0.6666666865348816, |
| "num_tokens": 1276127.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.008330464363098145, |
| "skip_count": 0.0, |
| "step": 806, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.393174061433447, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.4375, |
| "learning_rate": 0.0009969307435405766, |
| "loss": 0.1688, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1279056.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.004059277940541506, |
| "skip_count": 0.0, |
| "step": 808, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.404095563139932, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.3125, |
| "learning_rate": 0.0009968906646010474, |
| "loss": 0.1232, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1282092.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.005245010834187269, |
| "skip_count": 0.0, |
| "step": 810, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 1.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 4.415017064846416, |
| "f1_execute": 0.9411765336990356, |
| "f1_repeat": 1.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.3125, |
| "learning_rate": 0.0009968503264931167, |
| "loss": 0.0964, |
| "macro_f1": 0.6470588445663452, |
| "num_tokens": 1285759.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.04135916382074356, |
| "skip_count": 0.0, |
| "step": 812, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 4.425938566552901, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.0, |
| "learning_rate": 0.0009968097292378244, |
| "loss": 0.1636, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 1288141.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.11239507049322128, |
| "skip_count": 1.0, |
| "step": 814, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 4.436860068259386, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.71875, |
| "learning_rate": 0.0009967688728563446, |
| "loss": 0.1044, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 1291293.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.3831826150417328, |
| "skip_count": 0.0, |
| "step": 816, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 4.44778156996587, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.875, |
| "learning_rate": 0.0009967277573699875, |
| "loss": 0.1445, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 1293847.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.054437290877103806, |
| "skip_count": 0.0, |
| "step": 818, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.458703071672355, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 1.859375, |
| "learning_rate": 0.000996686382800198, |
| "loss": 0.0712, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1296724.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.012091469950973988, |
| "skip_count": 0.0, |
| "step": 820, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.75, |
| "avg_layers": 24.0, |
| "epoch": 4.46962457337884, |
| "f1_execute": 0.936170220375061, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.75, |
| "grad_norm": 4.4375, |
| "learning_rate": 0.000996644749168557, |
| "loss": 0.1332, |
| "macro_f1": 0.5620567798614502, |
| "num_tokens": 1299674.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.06590834259986877, |
| "skip_count": 4.0, |
| "step": 822, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 25.0, |
| "epoch": 4.480546075085324, |
| "f1_execute": 0.9200000166893005, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.4000000059604645, |
| "grad_norm": 3.265625, |
| "learning_rate": 0.0009966028564967785, |
| "loss": 0.1285, |
| "macro_f1": 0.4400000274181366, |
| "num_tokens": 1302843.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.06902799010276794, |
| "skip_count": 2.0, |
| "step": 824, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 25.0, |
| "epoch": 4.491467576791809, |
| "f1_execute": 0.9411764740943909, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.4000000059604645, |
| "grad_norm": 7.4375, |
| "learning_rate": 0.0009965607048067137, |
| "loss": 0.1249, |
| "macro_f1": 0.44705885648727417, |
| "num_tokens": 1305575.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.08320864289999008, |
| "skip_count": 2.0, |
| "step": 826, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 4.502389078498293, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.65625, |
| "learning_rate": 0.0009965182941203481, |
| "loss": 0.1834, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 1308244.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.12352414429187775, |
| "skip_count": 1.0, |
| "step": 828, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.513310580204778, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.9375, |
| "learning_rate": 0.0009964756244598021, |
| "loss": 0.0915, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1311314.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.014358235523104668, |
| "skip_count": 0.0, |
| "step": 830, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 4.524232081911263, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.65625, |
| "learning_rate": 0.0009964326958473316, |
| "loss": 0.102, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1315495.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.008667540736496449, |
| "skip_count": 0.0, |
| "step": 832, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 4.535153583617747, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.484375, |
| "learning_rate": 0.000996389508305327, |
| "loss": 0.0822, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1319132.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.018217027187347412, |
| "skip_count": 0.0, |
| "step": 834, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 4.546075085324232, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 10.8125, |
| "learning_rate": 0.000996346061856314, |
| "loss": 0.2215, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 1321294.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.1659325808286667, |
| "skip_count": 1.0, |
| "step": 836, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 4.556996587030717, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.625, |
| "learning_rate": 0.0009963023565229536, |
| "loss": 0.1108, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1324186.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.11435546725988388, |
| "skip_count": 0.0, |
| "step": 838, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.567918088737201, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.34375, |
| "learning_rate": 0.0009962583923280419, |
| "loss": 0.1153, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1327215.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.001215719268657267, |
| "skip_count": 0.0, |
| "step": 840, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.578839590443686, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.5625, |
| "learning_rate": 0.0009962141692945092, |
| "loss": 0.1181, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1330394.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.05636778846383095, |
| "skip_count": 0.0, |
| "step": 842, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 25.0, |
| "epoch": 4.58976109215017, |
| "f1_execute": 0.9803921580314636, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.800000011920929, |
| "grad_norm": 5.53125, |
| "learning_rate": 0.0009961696874454219, |
| "loss": 0.0985, |
| "macro_f1": 0.5934640765190125, |
| "num_tokens": 1333840.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.17423874139785767, |
| "skip_count": 2.0, |
| "step": 844, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 4.600682593856655, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.375, |
| "learning_rate": 0.0009961249468039806, |
| "loss": 0.1442, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1337481.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.08344361186027527, |
| "skip_count": 0.0, |
| "step": 846, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 4.611604095563139, |
| "f1_execute": 0.8799999952316284, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 8.1875, |
| "learning_rate": 0.0009960799473935212, |
| "loss": 0.1287, |
| "macro_f1": 0.29333335161209106, |
| "num_tokens": 1340525.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.10816935449838638, |
| "skip_count": 2.0, |
| "step": 848, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 4.622525597269624, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.703125, |
| "learning_rate": 0.0009960346892375143, |
| "loss": 0.1476, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1344963.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.02773604914546013, |
| "skip_count": 0.0, |
| "step": 850, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.633447098976109, |
| "f1_execute": 0.943396270275116, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.25, |
| "learning_rate": 0.000995989172359566, |
| "loss": 0.074, |
| "macro_f1": 0.3144654333591461, |
| "num_tokens": 1347911.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.07946910709142685, |
| "skip_count": 3.0, |
| "step": 852, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.6443686006825935, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.5625, |
| "learning_rate": 0.0009959433967834167, |
| "loss": 0.0946, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1352093.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.20672957599163055, |
| "skip_count": 1.0, |
| "step": 854, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.6666666865348816, |
| "acc_skip": 0.5, |
| "avg_layers": 28.0, |
| "epoch": 4.6552901023890785, |
| "f1_execute": 0.8780487775802612, |
| "f1_repeat": 0.6666666865348816, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 3.109375, |
| "learning_rate": 0.0009958973625329424, |
| "loss": 0.1035, |
| "macro_f1": 0.737127423286438, |
| "num_tokens": 1355052.0, |
| "repeat_count": 3.0, |
| "routers_loss": 0.14273089170455933, |
| "skip_count": 6.0, |
| "step": 856, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 4.6662116040955635, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.15625, |
| "learning_rate": 0.0009958510696321532, |
| "loss": 0.1217, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 1358739.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.03209677338600159, |
| "skip_count": 0.0, |
| "step": 858, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 4.6771331058020476, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.609375, |
| "learning_rate": 0.000995804518105195, |
| "loss": 0.1511, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1361816.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.016142090782523155, |
| "skip_count": 0.0, |
| "step": 860, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.6880546075085325, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.03125, |
| "learning_rate": 0.0009957577079763478, |
| "loss": 0.1588, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1365188.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.005357397720217705, |
| "skip_count": 0.0, |
| "step": 862, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.6989761092150175, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.53125, |
| "learning_rate": 0.0009957106392700272, |
| "loss": 0.0981, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1368207.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.005774896126240492, |
| "skip_count": 0.0, |
| "step": 864, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.709897610921502, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.859375, |
| "learning_rate": 0.000995663312010783, |
| "loss": 0.1432, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1370949.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0034105523955076933, |
| "skip_count": 0.0, |
| "step": 866, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 4.720819112627987, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.6875, |
| "learning_rate": 0.0009956157262233003, |
| "loss": 0.1171, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1373855.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.00975721050053835, |
| "skip_count": 0.0, |
| "step": 868, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 27.0, |
| "epoch": 4.731740614334471, |
| "f1_execute": 0.8979592323303223, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.4000000059604645, |
| "grad_norm": 9.8125, |
| "learning_rate": 0.000995567881932399, |
| "loss": 0.1658, |
| "macro_f1": 0.4326530694961548, |
| "num_tokens": 1376396.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.3017057776451111, |
| "skip_count": 3.0, |
| "step": 870, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.742662116040956, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.5625, |
| "learning_rate": 0.0009955197791630336, |
| "loss": 0.141, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1379027.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.008239896968007088, |
| "skip_count": 0.0, |
| "step": 872, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.753583617747441, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.53125, |
| "learning_rate": 0.0009954714179402936, |
| "loss": 0.1144, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1382288.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.010364998131990433, |
| "skip_count": 0.0, |
| "step": 874, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 26.0, |
| "epoch": 4.764505119453925, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 6.53125, |
| "learning_rate": 0.0009954227982894035, |
| "loss": 0.1795, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 1385672.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.15057335793972015, |
| "skip_count": 1.0, |
| "step": 876, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 4.77542662116041, |
| "f1_execute": 0.8799999952316284, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.90625, |
| "learning_rate": 0.0009953739202357217, |
| "loss": 0.1139, |
| "macro_f1": 0.29333335161209106, |
| "num_tokens": 1389206.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.42493173480033875, |
| "skip_count": 3.0, |
| "step": 878, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.786348122866894, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.46875, |
| "learning_rate": 0.0009953247838047428, |
| "loss": 0.1882, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1392492.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.005968689452856779, |
| "skip_count": 0.0, |
| "step": 880, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.797269624573379, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.046875, |
| "learning_rate": 0.0009952753890220948, |
| "loss": 0.1183, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1395478.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.14635904133319855, |
| "skip_count": 1.0, |
| "step": 882, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 4.808191126279864, |
| "f1_execute": 0.9019607901573181, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.34375, |
| "learning_rate": 0.0009952257359135417, |
| "loss": 0.1388, |
| "macro_f1": 0.3006536066532135, |
| "num_tokens": 1398518.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.1135154739022255, |
| "skip_count": 2.0, |
| "step": 884, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 28.0, |
| "epoch": 4.819112627986348, |
| "f1_execute": 0.9411765336990356, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 4.65625, |
| "learning_rate": 0.0009951758245049808, |
| "loss": 0.179, |
| "macro_f1": 0.5359477400779724, |
| "num_tokens": 1401259.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.18914444744586945, |
| "skip_count": 1.0, |
| "step": 886, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 27.0, |
| "epoch": 4.830034129692833, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 1.0, |
| "grad_norm": 2.5, |
| "learning_rate": 0.0009951256548224455, |
| "loss": 0.0913, |
| "macro_f1": 0.6603773832321167, |
| "num_tokens": 1404149.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.04007445275783539, |
| "skip_count": 1.0, |
| "step": 888, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 4.840955631399318, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.15625, |
| "learning_rate": 0.000995075226892103, |
| "loss": 0.129, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 1406960.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.4282263517379761, |
| "skip_count": 1.0, |
| "step": 890, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 1.0, |
| "acc_skip": 0.5714285969734192, |
| "avg_layers": 27.0, |
| "epoch": 4.851877133105802, |
| "f1_execute": 0.8999999761581421, |
| "f1_repeat": 0.800000011920929, |
| "f1_skip": 0.7272727489471436, |
| "grad_norm": 5.40625, |
| "learning_rate": 0.0009950245407402557, |
| "loss": 0.2196, |
| "macro_f1": 0.8090909719467163, |
| "num_tokens": 1409634.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.3470841348171234, |
| "skip_count": 7.0, |
| "step": 892, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 24.0, |
| "epoch": 4.862798634812287, |
| "f1_execute": 0.9795917868614197, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 2.3125, |
| "learning_rate": 0.0009949735963933404, |
| "loss": 0.115, |
| "macro_f1": 0.5487528443336487, |
| "num_tokens": 1413390.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.05957069247961044, |
| "skip_count": 2.0, |
| "step": 894, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.873720136518771, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.53125, |
| "learning_rate": 0.0009949223938779286, |
| "loss": 0.0754, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1416605.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.002007940784096718, |
| "skip_count": 0.0, |
| "step": 896, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 4.884641638225256, |
| "f1_execute": 0.9411764740943909, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 8.1875, |
| "learning_rate": 0.000994870933220727, |
| "loss": 0.1282, |
| "macro_f1": 0.4803921580314636, |
| "num_tokens": 1420764.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.08513174206018448, |
| "skip_count": 2.0, |
| "step": 898, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 4.895563139931741, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.09375, |
| "learning_rate": 0.0009948192144485757, |
| "loss": 0.0972, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 1424182.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.03853657469153404, |
| "skip_count": 1.0, |
| "step": 900, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 27.0, |
| "epoch": 4.906484641638225, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 1.0, |
| "grad_norm": 5.03125, |
| "learning_rate": 0.0009947672375884506, |
| "loss": 0.1737, |
| "macro_f1": 0.6666666865348816, |
| "num_tokens": 1426986.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.008192243054509163, |
| "skip_count": 1.0, |
| "step": 902, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 1.0, |
| "acc_skip": 1.0, |
| "avg_layers": 26.0, |
| "epoch": 4.91740614334471, |
| "f1_execute": 0.9795917868614197, |
| "f1_repeat": 1.0, |
| "f1_skip": 0.800000011920929, |
| "grad_norm": 4.875, |
| "learning_rate": 0.0009947150026674621, |
| "loss": 0.0577, |
| "macro_f1": 0.9265305995941162, |
| "num_tokens": 1429981.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.06954901665449142, |
| "skip_count": 2.0, |
| "step": 904, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 4.928327645051194, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.234375, |
| "learning_rate": 0.0009946625097128543, |
| "loss": 0.168, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 1432902.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0880909413099289, |
| "skip_count": 1.0, |
| "step": 906, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 4.939249146757679, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 1.9921875, |
| "learning_rate": 0.000994609758752007, |
| "loss": 0.1445, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1436788.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.5064544081687927, |
| "skip_count": 0.0, |
| "step": 908, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 4.950170648464164, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 4.0625, |
| "learning_rate": 0.0009945567498124339, |
| "loss": 0.1658, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 1439507.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.019065011292696, |
| "skip_count": 2.0, |
| "step": 910, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 4.961092150170648, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.40625, |
| "learning_rate": 0.0009945034829217832, |
| "loss": 0.0968, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1442860.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.018776487559080124, |
| "skip_count": 0.0, |
| "step": 912, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 4.972013651877133, |
| "f1_execute": 0.9230769276618958, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.71875, |
| "learning_rate": 0.0009944499581078382, |
| "loss": 0.1252, |
| "macro_f1": 0.3076923191547394, |
| "num_tokens": 1446637.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.1531504988670349, |
| "skip_count": 2.0, |
| "step": 914, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 4.982935153583618, |
| "f1_execute": 0.943396270275116, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.21875, |
| "learning_rate": 0.000994396175398516, |
| "loss": 0.0992, |
| "macro_f1": 0.3144654333591461, |
| "num_tokens": 1450238.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.1735955774784088, |
| "skip_count": 0.0, |
| "step": 916, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 4.993856655290102, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 7.6875, |
| "learning_rate": 0.000994342134821869, |
| "loss": 0.1523, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1453160.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.15269255638122559, |
| "skip_count": 0.0, |
| "step": 918, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 5.0, |
| "f1_execute": 0.9433962106704712, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 12.4375, |
| "learning_rate": 0.0009942878364060837, |
| "loss": 0.1131, |
| "macro_f1": 0.31446540355682373, |
| "num_tokens": 1454580.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.2639358341693878, |
| "skip_count": 0.0, |
| "step": 920, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 27.0, |
| "epoch": 5.010921501706485, |
| "f1_execute": 0.9803921580314636, |
| "f1_repeat": 0.0, |
| "f1_skip": 1.0, |
| "grad_norm": 5.09375, |
| "learning_rate": 0.0009942332801794807, |
| "loss": 0.1702, |
| "macro_f1": 0.6601307392120361, |
| "num_tokens": 1457292.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.043732915073633194, |
| "skip_count": 2.0, |
| "step": 922, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 28.0, |
| "epoch": 5.021843003412969, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 1.0, |
| "grad_norm": 2.8125, |
| "learning_rate": 0.000994178466170516, |
| "loss": 0.1107, |
| "macro_f1": 0.6538461446762085, |
| "num_tokens": 1460434.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.36936479806900024, |
| "skip_count": 1.0, |
| "step": 924, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 27.0, |
| "epoch": 5.032764505119454, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 1.0, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.0009941233944077788, |
| "loss": 0.0547, |
| "macro_f1": 0.6666666865348816, |
| "num_tokens": 1463373.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0019650806207209826, |
| "skip_count": 1.0, |
| "step": 926, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 5.043686006825938, |
| "f1_execute": 0.9629629254341125, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 1.125, |
| "learning_rate": 0.000994068064919994, |
| "loss": 0.0665, |
| "macro_f1": 0.32098764181137085, |
| "num_tokens": 1466927.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.06489580124616623, |
| "skip_count": 1.0, |
| "step": 928, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 27.0, |
| "epoch": 5.054607508532423, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.203125, |
| "learning_rate": 0.0009940124777360203, |
| "loss": 0.0898, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1469834.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.013250669464468956, |
| "skip_count": 0.0, |
| "step": 930, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 5.065529010238908, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.421875, |
| "learning_rate": 0.0009939566328848507, |
| "loss": 0.0616, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1472714.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.03642500564455986, |
| "skip_count": 1.0, |
| "step": 932, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 5.076450511945392, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 2.015625, |
| "learning_rate": 0.000993900530395613, |
| "loss": 0.0672, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 1476458.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.019950609654188156, |
| "skip_count": 2.0, |
| "step": 934, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 5.087372013651877, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 4.34375, |
| "learning_rate": 0.0009938441702975688, |
| "loss": 0.0714, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 1479499.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.05769496411085129, |
| "skip_count": 2.0, |
| "step": 936, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 26.0, |
| "epoch": 5.098293515358362, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 1.0, |
| "grad_norm": 3.09375, |
| "learning_rate": 0.000993787552620115, |
| "loss": 0.0647, |
| "macro_f1": 0.6666666865348816, |
| "num_tokens": 1482112.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.006518410053104162, |
| "skip_count": 2.0, |
| "step": 938, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 5.109215017064846, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 3.359375, |
| "learning_rate": 0.0009937306773927816, |
| "loss": 0.0569, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 1485128.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.16481046378612518, |
| "skip_count": 2.0, |
| "step": 940, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 5.120136518771331, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.375, |
| "learning_rate": 0.0009936735446452341, |
| "loss": 0.0689, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1487854.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.00462290458381176, |
| "skip_count": 0.0, |
| "step": 942, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 5.131058020477815, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.0625, |
| "learning_rate": 0.0009936161544072716, |
| "loss": 0.0596, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1490795.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0042699906043708324, |
| "skip_count": 0.0, |
| "step": 944, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 5.1419795221843, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 2.78125, |
| "learning_rate": 0.0009935585067088275, |
| "loss": 0.1091, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 1494150.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.01713154837489128, |
| "skip_count": 2.0, |
| "step": 946, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 5.152901023890785, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.640625, |
| "learning_rate": 0.0009935006015799703, |
| "loss": 0.0893, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1497517.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.014775852672755718, |
| "skip_count": 0.0, |
| "step": 948, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 5.163822525597269, |
| "f1_execute": 0.9629629850387573, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.828125, |
| "learning_rate": 0.0009934424390509017, |
| "loss": 0.1128, |
| "macro_f1": 0.32098767161369324, |
| "num_tokens": 1500944.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.08066675066947937, |
| "skip_count": 1.0, |
| "step": 950, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.25, |
| "avg_layers": 27.0, |
| "epoch": 5.174744027303754, |
| "f1_execute": 0.9411765336990356, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.4000000059604645, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.0009933840191519584, |
| "loss": 0.0536, |
| "macro_f1": 0.44705885648727417, |
| "num_tokens": 1504267.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.10788286477327347, |
| "skip_count": 4.0, |
| "step": 952, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 1.0, |
| "acc_skip": 0.3333333432674408, |
| "avg_layers": 28.0, |
| "epoch": 5.1856655290102385, |
| "f1_execute": 0.9600000381469727, |
| "f1_repeat": 1.0, |
| "f1_skip": 0.5, |
| "grad_norm": 2.1875, |
| "learning_rate": 0.0009933253419136107, |
| "loss": 0.0582, |
| "macro_f1": 0.8200000524520874, |
| "num_tokens": 1507688.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.088263139128685, |
| "skip_count": 3.0, |
| "step": 954, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 5.1965870307167235, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.90625, |
| "learning_rate": 0.000993266407366464, |
| "loss": 0.0989, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1510658.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.005081284325569868, |
| "skip_count": 0.0, |
| "step": 956, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 5.207508532423208, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.000993207215541257, |
| "loss": 0.0562, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 1515152.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.025190535932779312, |
| "skip_count": 2.0, |
| "step": 958, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 1.0, |
| "acc_skip": 0.0, |
| "avg_layers": 29.0, |
| "epoch": 5.2184300341296925, |
| "f1_execute": 1.0, |
| "f1_repeat": 1.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.3125, |
| "learning_rate": 0.000993147766468863, |
| "loss": 0.0672, |
| "macro_f1": 0.6666666865348816, |
| "num_tokens": 1518790.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.007869229651987553, |
| "skip_count": 0.0, |
| "step": 960, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 5.2293515358361775, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 1.609375, |
| "learning_rate": 0.0009930880601802898, |
| "loss": 0.0658, |
| "macro_f1": 0.5427350401878357, |
| "num_tokens": 1522153.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.15375611186027527, |
| "skip_count": 2.0, |
| "step": 962, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.6666666865348816, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 5.2402730375426625, |
| "f1_execute": 0.8444444537162781, |
| "f1_repeat": 0.800000011920929, |
| "f1_skip": 0.0, |
| "grad_norm": 5.15625, |
| "learning_rate": 0.0009930280967066787, |
| "loss": 0.1698, |
| "macro_f1": 0.5481481552124023, |
| "num_tokens": 1525054.0, |
| "repeat_count": 3.0, |
| "routers_loss": 0.3285106122493744, |
| "skip_count": 4.0, |
| "step": 964, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 5.251194539249147, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 2.71875, |
| "learning_rate": 0.0009929678760793057, |
| "loss": 0.0853, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 1528654.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.06668563932180405, |
| "skip_count": 2.0, |
| "step": 966, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 5.262116040955632, |
| "f1_execute": 0.9166666865348816, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 2.734375, |
| "learning_rate": 0.0009929073983295804, |
| "loss": 0.0927, |
| "macro_f1": 0.5277777910232544, |
| "num_tokens": 1531379.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.2843759059906006, |
| "skip_count": 4.0, |
| "step": 968, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 5.273037542662116, |
| "f1_execute": 0.936170220375061, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5714285373687744, |
| "grad_norm": 2.265625, |
| "learning_rate": 0.0009928466634890473, |
| "loss": 0.0759, |
| "macro_f1": 0.502532958984375, |
| "num_tokens": 1534519.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.061425577849149704, |
| "skip_count": 4.0, |
| "step": 970, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 5.283959044368601, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 3.859375, |
| "learning_rate": 0.0009927856715893839, |
| "loss": 0.1502, |
| "macro_f1": 0.4871794879436493, |
| "num_tokens": 1537641.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.12876227498054504, |
| "skip_count": 2.0, |
| "step": 972, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 5.294880546075086, |
| "f1_execute": 0.9411764740943909, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.5, |
| "grad_norm": 2.703125, |
| "learning_rate": 0.0009927244226624029, |
| "loss": 0.0589, |
| "macro_f1": 0.4803921580314636, |
| "num_tokens": 1540885.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.24013344943523407, |
| "skip_count": 2.0, |
| "step": 974, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 27.0, |
| "epoch": 5.30580204778157, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 1.0, |
| "grad_norm": 2.578125, |
| "learning_rate": 0.00099266291674005, |
| "loss": 0.1553, |
| "macro_f1": 0.6666666865348816, |
| "num_tokens": 1545093.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.008588392287492752, |
| "skip_count": 1.0, |
| "step": 976, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 5.316723549488055, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 5.0625, |
| "learning_rate": 0.000992601153854406, |
| "loss": 0.0732, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1547669.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.1047874391078949, |
| "skip_count": 1.0, |
| "step": 978, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 26.0, |
| "epoch": 5.327645051194539, |
| "f1_execute": 0.8571428656578064, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 6.15625, |
| "learning_rate": 0.000992539134037685, |
| "loss": 0.1686, |
| "macro_f1": 0.2857142984867096, |
| "num_tokens": 1550684.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.3830685019493103, |
| "skip_count": 2.0, |
| "step": 980, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 5.338566552901024, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.296875, |
| "learning_rate": 0.0009924768573222353, |
| "loss": 0.0979, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1553458.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0034001434687525034, |
| "skip_count": 0.0, |
| "step": 982, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 5.349488054607509, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.515625, |
| "learning_rate": 0.0009924143237405392, |
| "loss": 0.0553, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1557067.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0015051440568640828, |
| "skip_count": 0.0, |
| "step": 984, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 25.0, |
| "epoch": 5.360409556313993, |
| "f1_execute": 0.9019607901573181, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 4.375, |
| "learning_rate": 0.0009923515333252128, |
| "loss": 0.0821, |
| "macro_f1": 0.3006536066532135, |
| "num_tokens": 1560210.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.38080108165740967, |
| "skip_count": 2.0, |
| "step": 986, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 1.0, |
| "avg_layers": 27.0, |
| "epoch": 5.371331058020478, |
| "f1_execute": 0.9411764740943909, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 5.34375, |
| "learning_rate": 0.0009922884861090068, |
| "loss": 0.104, |
| "macro_f1": 0.5359477400779724, |
| "num_tokens": 1563164.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.15402451157569885, |
| "skip_count": 1.0, |
| "step": 988, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 5.382252559726963, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.6875, |
| "learning_rate": 0.0009922251821248053, |
| "loss": 0.0596, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1566178.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.0008378620259463787, |
| "skip_count": 0.0, |
| "step": 990, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 5.393174061433447, |
| "f1_execute": 0.9818181991577148, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 3.5, |
| "learning_rate": 0.0009921616214056258, |
| "loss": 0.0858, |
| "macro_f1": 0.3272727429866791, |
| "num_tokens": 1568705.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.1363816112279892, |
| "skip_count": 1.0, |
| "step": 992, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 26.0, |
| "epoch": 5.404095563139932, |
| "f1_execute": 0.9166666865348816, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 3.125, |
| "learning_rate": 0.000992097803984621, |
| "loss": 0.0683, |
| "macro_f1": 0.5277777910232544, |
| "num_tokens": 1571934.0, |
| "repeat_count": 2.0, |
| "routers_loss": 0.15122386813163757, |
| "skip_count": 4.0, |
| "step": 994, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 1.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 5.415017064846416, |
| "f1_execute": 0.9615384340286255, |
| "f1_repeat": 1.0, |
| "f1_skip": 0.0, |
| "grad_norm": 2.328125, |
| "learning_rate": 0.0009920337298950765, |
| "loss": 0.12, |
| "macro_f1": 0.6538461446762085, |
| "num_tokens": 1574947.0, |
| "repeat_count": 1.0, |
| "routers_loss": 0.16266369819641113, |
| "skip_count": 1.0, |
| "step": 996, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.0, |
| "avg_layers": 28.0, |
| "epoch": 5.425938566552901, |
| "f1_execute": 1.0, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.0, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.0009919693991704123, |
| "loss": 0.0627, |
| "macro_f1": 0.3333333432674408, |
| "num_tokens": 1577895.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.002958054654300213, |
| "skip_count": 0.0, |
| "step": 998, |
| "text_loss": 0.0 |
| }, |
| { |
| "acc_repeat": 0.0, |
| "acc_skip": 0.5, |
| "avg_layers": 27.0, |
| "epoch": 5.436860068259386, |
| "f1_execute": 0.9811320900917053, |
| "f1_repeat": 0.0, |
| "f1_skip": 0.6666666865348816, |
| "grad_norm": 3.703125, |
| "learning_rate": 0.0009919048118441818, |
| "loss": 0.1173, |
| "macro_f1": 0.5492662787437439, |
| "num_tokens": 1581513.0, |
| "repeat_count": 0.0, |
| "routers_loss": 0.08616811782121658, |
| "skip_count": 2.0, |
| "step": 1000, |
| "text_loss": 0.0 |
| } |
| ], |
| "logging_steps": 2, |
| "max_steps": 9200, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 50, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.7215681060599736e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|